From af2a5544875b23b3b62fb6d4f3bf432828720008 Mon Sep 17 00:00:00 2001 From: Wenchen Fan Date: Thu, 8 Oct 2015 12:42:10 -0700 Subject: [PATCH 001/139] [SPARK-10337] [SQL] fix hive views on non-hive-compatible tables. add a new config to deal with this special case. Author: Wenchen Fan Closes #8990 from cloud-fan/view-master. --- .../scala/org/apache/spark/sql/SQLConf.scala | 15 +- .../spark/sql/hive/HiveMetastoreCatalog.scala | 23 +++ .../org/apache/spark/sql/hive/HiveQl.scala | 164 +++++++++++++++--- .../sql/hive/client/ClientInterface.scala | 13 +- .../spark/sql/hive/client/ClientWrapper.scala | 31 ++++ .../hive/execution/CreateViewAsSelect.scala | 97 +++++++++++ .../sql/hive/execution/SQLQuerySuite.scala | 117 +++++++++++++ 7 files changed, 433 insertions(+), 27 deletions(-) create mode 100644 sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/CreateViewAsSelect.scala diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala b/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala index e7bbc7d5db493..8f0f8910b36ab 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala @@ -319,6 +319,15 @@ private[spark] object SQLConf { doc = "When true, some predicates will be pushed down into the Hive metastore so that " + "unmatching partitions can be eliminated earlier.") + val CANONICALIZE_VIEW = booleanConf("spark.sql.canonicalizeView", + defaultValue = Some(false), + doc = "When true, CREATE VIEW will be handled by Spark SQL instead of Hive native commands. " + + "Note that this function is experimental and should ony be used when you are using " + + "non-hive-compatible tables written by Spark SQL. The SQL string used to create " + + "view should be fully qualified, i.e. use `tbl1`.`col1` instead of `*` whenever " + + "possible, or you may get wrong result.", + isPublic = false) + val COLUMN_NAME_OF_CORRUPT_RECORD = stringConf("spark.sql.columnNameOfCorruptRecord", defaultValue = Some("_corrupt_record"), doc = "") @@ -362,7 +371,7 @@ private[spark] object SQLConf { val PARTITION_DISCOVERY_ENABLED = booleanConf("spark.sql.sources.partitionDiscovery.enabled", defaultValue = Some(true), - doc = "When true, automtically discover data partitions.") + doc = "When true, automatically discover data partitions.") val PARTITION_COLUMN_TYPE_INFERENCE = booleanConf("spark.sql.sources.partitionColumnTypeInference.enabled", @@ -372,7 +381,7 @@ private[spark] object SQLConf { val PARTITION_MAX_FILES = intConf("spark.sql.sources.maxConcurrentWrites", defaultValue = Some(5), - doc = "The maximum number of concurent files to open before falling back on sorting when " + + doc = "The maximum number of concurrent files to open before falling back on sorting when " + "writing out files using dynamic partitioning.") // The output committer class used by HadoopFsRelation. 
The specified class needs to be a @@ -471,6 +480,8 @@ private[sql] class SQLConf extends Serializable with CatalystConf { private[spark] def metastorePartitionPruning: Boolean = getConf(HIVE_METASTORE_PARTITION_PRUNING) + private[spark] def canonicalizeView: Boolean = getConf(CANONICALIZE_VIEW) + private[spark] def sortMergeJoinEnabled: Boolean = getConf(SORTMERGE_JOIN) private[spark] def codegenEnabled: Boolean = getConf(CODEGEN_ENABLED, getConf(TUNGSTEN_ENABLED)) diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala index ea1521a48c8a7..cf59bc0d590b0 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala @@ -41,6 +41,7 @@ import org.apache.spark.sql.execution.datasources.parquet.ParquetRelation import org.apache.spark.sql.execution.datasources.{CreateTableUsingAsSelect, LogicalRelation, Partition => ParquetPartition, PartitionSpec, ResolvedDataSource} import org.apache.spark.sql.execution.{FileRelation, datasources} import org.apache.spark.sql.hive.client._ +import org.apache.spark.sql.hive.execution.HiveNativeCommand import org.apache.spark.sql.sources._ import org.apache.spark.sql.types._ import org.apache.spark.sql.{AnalysisException, SQLContext, SaveMode} @@ -588,6 +589,28 @@ private[hive] class HiveMetastoreCatalog(val client: ClientInterface, hive: Hive // Wait until children are resolved. case p: LogicalPlan if !p.childrenResolved => p case p: LogicalPlan if p.resolved => p + + case CreateViewAsSelect(table, child, allowExisting, replace, sql) => + if (conf.canonicalizeView) { + if (allowExisting && replace) { + throw new AnalysisException( + "It is not allowed to define a view with both IF NOT EXISTS and OR REPLACE.") + } + + val (dbName, tblName) = processDatabaseAndTableName( + table.specifiedDatabase.getOrElse(client.currentDatabase), table.name) + + execution.CreateViewAsSelect( + table.copy( + specifiedDatabase = Some(dbName), + name = tblName), + child.output, + allowExisting, + replace) + } else { + HiveNativeCommand(sql) + } + case p @ CreateTableAsSelect(table, child, allowExisting) => val schema = if (table.schema.nonEmpty) { table.schema diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala index 256440a9a2e97..2bf22f5449641 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala @@ -77,6 +77,16 @@ private[hive] case class CreateTableAsSelect( childrenResolved } +private[hive] case class CreateViewAsSelect( + tableDesc: HiveTable, + child: LogicalPlan, + allowExisting: Boolean, + replace: Boolean, + sql: String) extends UnaryNode with Command { + override def output: Seq[Attribute] = Seq.empty[Attribute] + override lazy val resolved: Boolean = false +} + /** Provides a mapping from HiveQL statements to catalyst logical plans and expression trees. 
*/ private[hive] object HiveQl extends Logging { protected val nativeCommands = Seq( @@ -99,7 +109,6 @@ private[hive] object HiveQl extends Logging { "TOK_ALTERTABLE_SKEWED", "TOK_ALTERTABLE_TOUCH", "TOK_ALTERTABLE_UNARCHIVE", - "TOK_ALTERVIEW", "TOK_ALTERVIEW_ADDPARTS", "TOK_ALTERVIEW_AS", "TOK_ALTERVIEW_DROPPARTS", @@ -110,7 +119,6 @@ private[hive] object HiveQl extends Logging { "TOK_CREATEFUNCTION", "TOK_CREATEINDEX", "TOK_CREATEROLE", - "TOK_CREATEVIEW", "TOK_DESCDATABASE", "TOK_DESCFUNCTION", @@ -254,12 +262,17 @@ private[hive] object HiveQl extends Logging { * Otherwise, there will be Null pointer exception, * when retrieving properties form HiveConf. */ - val hContext = new Context(SessionState.get().getConf()) - val node = ParseUtils.findRootNonNullToken((new ParseDriver).parse(sql, hContext)) + val hContext = createContext() + val node = getAst(sql, hContext) hContext.clear() node } + private def createContext(): Context = new Context(SessionState.get().getConf()) + + private def getAst(sql: String, context: Context) = + ParseUtils.findRootNonNullToken((new ParseDriver).parse(sql, context)) + /** * Returns the HiveConf */ @@ -280,15 +293,18 @@ private[hive] object HiveQl extends Logging { /** Creates LogicalPlan for a given HiveQL string. */ def createPlan(sql: String): LogicalPlan = { try { - val tree = getAst(sql) - if (nativeCommands contains tree.getText) { + val context = createContext() + val tree = getAst(sql, context) + val plan = if (nativeCommands contains tree.getText) { HiveNativeCommand(sql) } else { - nodeToPlan(tree) match { + nodeToPlan(tree, context) match { case NativePlaceholder => HiveNativeCommand(sql) case other => other } } + context.clear() + plan } catch { case pe: org.apache.hadoop.hive.ql.parse.ParseException => pe.getMessage match { @@ -342,7 +358,9 @@ private[hive] object HiveQl extends Logging { } } - protected def getClauses(clauseNames: Seq[String], nodeList: Seq[ASTNode]): Seq[Option[Node]] = { + protected def getClauses( + clauseNames: Seq[String], + nodeList: Seq[ASTNode]): Seq[Option[ASTNode]] = { var remainingNodes = nodeList val clauses = clauseNames.map { clauseName => val (matches, nonMatches) = remainingNodes.partition(_.getText.toUpperCase == clauseName) @@ -489,7 +507,43 @@ https://cwiki.apache.org/confluence/display/Hive/Enhanced+Aggregation%2C+Cube%2C } } - protected def nodeToPlan(node: Node): LogicalPlan = node match { + private def createView( + view: ASTNode, + context: Context, + viewNameParts: ASTNode, + query: ASTNode, + schema: Seq[HiveColumn], + properties: Map[String, String], + allowExist: Boolean, + replace: Boolean): CreateViewAsSelect = { + val (db, viewName) = extractDbNameTableName(viewNameParts) + + val originalText = context.getTokenRewriteStream + .toString(query.getTokenStartIndex, query.getTokenStopIndex) + + val tableDesc = HiveTable( + specifiedDatabase = db, + name = viewName, + schema = schema, + partitionColumns = Seq.empty[HiveColumn], + properties = properties, + serdeProperties = Map[String, String](), + tableType = VirtualView, + location = None, + inputFormat = None, + outputFormat = None, + serde = None, + viewText = Some(originalText)) + + // We need to keep the original SQL string so that if `spark.sql.canonicalizeView` is + // false, we can fall back to use hive native command later. + // We can remove this when parser is configurable(can access SQLConf) in the future. 
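A minimal sketch of the fallback decision that later consumes this kept SQL string, using illustrative stand-in classes rather than the real Catalyst/Hive plan nodes: when `spark.sql.canonicalizeView` is off, the untouched statement is simply handed back to Hive as a native command.

```scala
// Illustrative stand-ins only; the real rule lives in HiveMetastoreCatalog above.
sealed trait ViewPlan
case class SparkCreateView(viewName: String, querySql: String) extends ViewPlan
case class HiveNative(sql: String) extends ViewPlan

def planCreateView(
    canonicalizeView: Boolean, // value of spark.sql.canonicalizeView
    viewName: String,
    querySql: String,          // text of the TOK_QUERY subtree
    originalSql: String        // full CREATE VIEW statement kept below
  ): ViewPlan =
  if (canonicalizeView) SparkCreateView(viewName, querySql)
  else HiveNative(originalSql)
```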
+ val sql = context.getTokenRewriteStream + .toString(view.getTokenStartIndex, view.getTokenStopIndex) + CreateViewAsSelect(tableDesc, nodeToPlan(query, context), allowExist, replace, sql) + } + + protected def nodeToPlan(node: ASTNode, context: Context): LogicalPlan = node match { // Special drop table that also uncaches. case Token("TOK_DROPTABLE", Token("TOK_TABNAME", tableNameParts) :: @@ -521,14 +575,14 @@ https://cwiki.apache.org/confluence/display/Hive/Enhanced+Aggregation%2C+Cube%2C val Some(crtTbl) :: _ :: extended :: Nil = getClauses(Seq("TOK_CREATETABLE", "FORMATTED", "EXTENDED"), explainArgs) ExplainCommand( - nodeToPlan(crtTbl), + nodeToPlan(crtTbl, context), extended = extended.isDefined) case Token("TOK_EXPLAIN", explainArgs) => // Ignore FORMATTED if present. val Some(query) :: _ :: extended :: Nil = getClauses(Seq("TOK_QUERY", "FORMATTED", "EXTENDED"), explainArgs) ExplainCommand( - nodeToPlan(query), + nodeToPlan(query, context), extended = extended.isDefined) case Token("TOK_DESCTABLE", describeArgs) => @@ -563,6 +617,73 @@ https://cwiki.apache.org/confluence/display/Hive/Enhanced+Aggregation%2C+Cube%2C } } + case view @ Token("TOK_ALTERVIEW", children) => + val Some(viewNameParts) :: maybeQuery :: ignores = + getClauses(Seq( + "TOK_TABNAME", + "TOK_QUERY", + "TOK_ALTERVIEW_ADDPARTS", + "TOK_ALTERVIEW_DROPPARTS", + "TOK_ALTERVIEW_PROPERTIES", + "TOK_ALTERVIEW_RENAME"), children) + + // if ALTER VIEW doesn't have query part, let hive to handle it. + maybeQuery.map { query => + createView(view, context, viewNameParts, query, Nil, Map(), false, true) + }.getOrElse(NativePlaceholder) + + case view @ Token("TOK_CREATEVIEW", children) + if children.collect { case t @ Token("TOK_QUERY", _) => t }.nonEmpty => + val Seq( + Some(viewNameParts), + Some(query), + maybeComment, + replace, + allowExisting, + maybeProperties, + maybeColumns, + maybePartCols + ) = getClauses(Seq( + "TOK_TABNAME", + "TOK_QUERY", + "TOK_TABLECOMMENT", + "TOK_ORREPLACE", + "TOK_IFNOTEXISTS", + "TOK_TABLEPROPERTIES", + "TOK_TABCOLNAME", + "TOK_VIEWPARTCOLS"), children) + + // If the view is partitioned, we let hive handle it. + if (maybePartCols.isDefined) { + NativePlaceholder + } else { + val schema = maybeColumns.map { cols => + BaseSemanticAnalyzer.getColumns(cols, true).asScala.map { field => + // We can't specify column types when create view, so fill it with null first, and + // update it after the schema has been resolved later. 
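A rough illustration of that two-step typing, with a toy `Col` class standing in for `HiveColumn`: the parser records columns with a `null` type, and the runnable command later fills the types in from the resolved output of the view's query, much like `prepareTable()` in `CreateViewAsSelect` further down in this patch.

```scala
// Toy stand-in for HiveColumn; column types are unknown at parse time.
case class Col(name: String, hiveType: String, comment: String)

val declared      = Seq(Col("c1", null, "blabla"), Col("c2", null, "blabla"))
val resolvedTypes = Seq("bigint", "string") // taken from the analyzed child plan
val finalSchema   = declared.zip(resolvedTypes).map { case (c, t) => c.copy(hiveType = t) }
// finalSchema: List(Col(c1,bigint,blabla), Col(c2,string,blabla))
```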
+ HiveColumn(field.getName, null, field.getComment) + } + }.getOrElse(Seq.empty[HiveColumn]) + + val properties = scala.collection.mutable.Map.empty[String, String] + + maybeProperties.foreach { + case Token("TOK_TABLEPROPERTIES", list :: Nil) => + properties ++= getProperties(list) + } + + maybeComment.foreach { + case Token("TOK_TABLECOMMENT", child :: Nil) => + val comment = BaseSemanticAnalyzer.unescapeSQLString(child.getText) + if (comment ne null) { + properties += ("comment" -> comment) + } + } + + createView(view, context, viewNameParts, query, schema, properties.toMap, + allowExisting.isDefined, replace.isDefined) + } + case Token("TOK_CREATETABLE", children) if children.collect { case t @ Token("TOK_QUERY", _) => t }.nonEmpty => // Reference: https://cwiki.apache.org/confluence/display/Hive/LanguageManual+DDL @@ -774,7 +895,7 @@ https://cwiki.apache.org/confluence/display/Hive/Enhanced+Aggregation%2C+Cube%2C case _ => // Unsupport features } - CreateTableAsSelect(tableDesc, nodeToPlan(query), allowExisting != None) + CreateTableAsSelect(tableDesc, nodeToPlan(query, context), allowExisting != None) // If its not a "CTAS" like above then take it as a native command case Token("TOK_CREATETABLE", _) => NativePlaceholder @@ -793,7 +914,7 @@ https://cwiki.apache.org/confluence/display/Hive/Enhanced+Aggregation%2C+Cube%2C insertClauses.last match { case Token("TOK_CTE", cteClauses) => val cteRelations = cteClauses.map(node => { - val relation = nodeToRelation(node).asInstanceOf[Subquery] + val relation = nodeToRelation(node, context).asInstanceOf[Subquery] (relation.alias, relation) }).toMap (Some(args.head), insertClauses.init, Some(cteRelations)) @@ -847,7 +968,7 @@ https://cwiki.apache.org/confluence/display/Hive/Enhanced+Aggregation%2C+Cube%2C } val relations = fromClause match { - case Some(f) => nodeToRelation(f) + case Some(f) => nodeToRelation(f, context) case None => OneRowRelation } @@ -1094,7 +1215,8 @@ https://cwiki.apache.org/confluence/display/Hive/Enhanced+Aggregation%2C+Cube%2C cteRelations.map(With(query, _)).getOrElse(query) // HIVE-9039 renamed TOK_UNION => TOK_UNIONALL while adding TOK_UNIONDISTINCT - case Token("TOK_UNIONALL", left :: right :: Nil) => Union(nodeToPlan(left), nodeToPlan(right)) + case Token("TOK_UNIONALL", left :: right :: Nil) => + Union(nodeToPlan(left, context), nodeToPlan(right, context)) case a: ASTNode => throw new NotImplementedError(s"No parse rules for $node:\n ${dumpTree(a).toString} ") @@ -1102,10 +1224,10 @@ https://cwiki.apache.org/confluence/display/Hive/Enhanced+Aggregation%2C+Cube%2C val allJoinTokens = "(TOK_.*JOIN)".r val laterViewToken = "TOK_LATERAL_VIEW(.*)".r - def nodeToRelation(node: Node): LogicalPlan = node match { + def nodeToRelation(node: Node, context: Context): LogicalPlan = node match { case Token("TOK_SUBQUERY", query :: Token(alias, Nil) :: Nil) => - Subquery(cleanIdentifier(alias), nodeToPlan(query)) + Subquery(cleanIdentifier(alias), nodeToPlan(query, context)) case Token(laterViewToken(isOuter), selectClause :: relationClause :: Nil) => val Token("TOK_SELECT", @@ -1121,7 +1243,7 @@ https://cwiki.apache.org/confluence/display/Hive/Enhanced+Aggregation%2C+Cube%2C outer = isOuter.nonEmpty, Some(alias.toLowerCase), attributes.map(UnresolvedAttribute(_)), - nodeToRelation(relationClause)) + nodeToRelation(relationClause, context)) /* All relations, possibly with aliases or sampling clauses. 
*/ case Token("TOK_TABREF", clauses) => @@ -1189,7 +1311,7 @@ https://cwiki.apache.org/confluence/display/Hive/Enhanced+Aggregation%2C+Cube%2C }.map(_._2) val isPreserved = tableOrdinals.map(i => (i - 1 < 0) || joinArgs(i - 1).getText == "PRESERVE") - val tables = tableOrdinals.map(i => nodeToRelation(joinArgs(i))) + val tables = tableOrdinals.map(i => nodeToRelation(joinArgs(i), context)) val joinExpressions = tableOrdinals.map(i => joinArgs(i + 1).getChildren.asScala.map(nodeToExpr)) @@ -1244,8 +1366,8 @@ https://cwiki.apache.org/confluence/display/Hive/Enhanced+Aggregation%2C+Cube%2C case "TOK_FULLOUTERJOIN" => FullOuter case "TOK_LEFTSEMIJOIN" => LeftSemi } - Join(nodeToRelation(relation1), - nodeToRelation(relation2), + Join(nodeToRelation(relation1, context), + nodeToRelation(relation2, context), joinType, other.headOption.map(nodeToExpr)) diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/ClientInterface.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/ClientInterface.scala index 3811c152a7ae6..915eae9d21e23 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/ClientInterface.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/ClientInterface.scala @@ -19,13 +19,12 @@ package org.apache.spark.sql.hive.client import java.io.PrintStream import java.util.{Map => JMap} +import javax.annotation.Nullable import org.apache.spark.sql.catalyst.analysis.{NoSuchDatabaseException, NoSuchTableException} import org.apache.spark.sql.catalyst.expressions.Expression -private[hive] case class HiveDatabase( - name: String, - location: String) +private[hive] case class HiveDatabase(name: String, location: String) private[hive] abstract class TableType { val name: String } private[hive] case object ExternalTable extends TableType { override val name = "EXTERNAL_TABLE" } @@ -45,7 +44,7 @@ private[hive] case class HivePartition( values: Seq[String], storage: HiveStorageDescriptor) -private[hive] case class HiveColumn(name: String, hiveType: String, comment: String) +private[hive] case class HiveColumn(name: String, @Nullable hiveType: String, comment: String) private[hive] case class HiveTable( specifiedDatabase: Option[String], name: String, @@ -126,6 +125,12 @@ private[hive] trait ClientInterface { /** Returns the metadata for the specified table or None if it doens't exist. */ def getTableOption(dbName: String, tableName: String): Option[HiveTable] + /** Creates a view with the given metadata. */ + def createView(view: HiveTable): Unit + + /** Updates the given view with new metadata. */ + def alertView(view: HiveTable): Unit + /** Creates a table with the given metadata. */ def createTable(table: HiveTable): Unit diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/ClientWrapper.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/ClientWrapper.scala index 4d1e3ed9198e6..8f6d448b2aef4 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/ClientWrapper.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/ClientWrapper.scala @@ -354,6 +354,37 @@ private[hive] class ClientWrapper( qlTable } + private def toViewTable(view: HiveTable): metadata.Table = { + // TODO: this is duplicated with `toQlTable` except the table type stuff. 
+ val tbl = new metadata.Table(view.database, view.name) + tbl.setTableType(HTableType.VIRTUAL_VIEW) + tbl.setSerializationLib(null) + tbl.clearSerDeInfo() + + // TODO: we will save the same SQL string to original and expanded text, which is different + // from Hive. + tbl.setViewOriginalText(view.viewText.get) + tbl.setViewExpandedText(view.viewText.get) + + tbl.setFields(view.schema.map(c => new FieldSchema(c.name, c.hiveType, c.comment)).asJava) + view.properties.foreach { case (k, v) => tbl.setProperty(k, v) } + + // set owner + tbl.setOwner(conf.getUser) + // set create time + tbl.setCreateTime((System.currentTimeMillis() / 1000).asInstanceOf[Int]) + + tbl + } + + override def createView(view: HiveTable): Unit = withHiveState { + client.createTable(toViewTable(view)) + } + + override def alertView(view: HiveTable): Unit = withHiveState { + client.alterTable(view.qualifiedName, toViewTable(view)) + } + override def createTable(table: HiveTable): Unit = withHiveState { val qlTable = toQlTable(table) client.createTable(qlTable) diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/CreateViewAsSelect.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/CreateViewAsSelect.scala new file mode 100644 index 0000000000000..2b504ac974f07 --- /dev/null +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/CreateViewAsSelect.scala @@ -0,0 +1,97 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.hive.execution + +import org.apache.spark.sql.catalyst.expressions.Attribute +import org.apache.spark.sql.hive.{HiveMetastoreTypes, HiveContext} +import org.apache.spark.sql.{AnalysisException, Row, SQLContext} +import org.apache.spark.sql.execution.RunnableCommand +import org.apache.spark.sql.hive.client.{HiveColumn, HiveTable} + +/** + * Create Hive view on non-hive-compatible tables by specifying schema ourselves instead of + * depending on Hive meta-store. + */ +// TODO: Note that this class can NOT canonicalize the view SQL string entirely, which is different +// from Hive and may not work for some cases like create view on self join. 
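Put differently, the stored view text is only wrapped in an outer projection rather than fully rewritten, so references inside it stay exactly as the user typed them. A string-building sketch of that expansion for a simple view, mirroring the text assembled in `prepareTable()` below:

```scala
val userSql  = "SELECT * FROM jt"   // what the user wrote after AS
val columns  = Seq("id")            // resolved output columns of the child plan
val viewName = "testView"

val expandedText =
  s"SELECT ${columns.map(c => s"`$c`").mkString(", ")} FROM ($userSql) `$viewName`"
// expandedText: SELECT `id` FROM (SELECT * FROM jt) `testView`
```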
+private[hive] case class CreateViewAsSelect( + tableDesc: HiveTable, + childSchema: Seq[Attribute], + allowExisting: Boolean, + orReplace: Boolean) extends RunnableCommand { + + assert(tableDesc.schema == Nil || tableDesc.schema.length == childSchema.length) + assert(tableDesc.viewText.isDefined) + + override def run(sqlContext: SQLContext): Seq[Row] = { + val hiveContext = sqlContext.asInstanceOf[HiveContext] + val database = tableDesc.database + val viewName = tableDesc.name + + if (hiveContext.catalog.tableExists(Seq(database, viewName))) { + if (allowExisting) { + // view already exists, will do nothing, to keep consistent with Hive + } else if (orReplace) { + hiveContext.catalog.client.alertView(prepareTable()) + } else { + throw new AnalysisException(s"View $database.$viewName already exists. " + + "If you want to update the view definition, please use ALTER VIEW AS or " + + "CREATE OR REPLACE VIEW AS") + } + } else { + hiveContext.catalog.client.createView(prepareTable()) + } + + Seq.empty[Row] + } + + private def prepareTable(): HiveTable = { + // setup column types according to the schema of child. + val schema = if (tableDesc.schema == Nil) { + childSchema.map { attr => + HiveColumn(attr.name, HiveMetastoreTypes.toMetastoreType(attr.dataType), null) + } + } else { + childSchema.zip(tableDesc.schema).map { case (attr, col) => + HiveColumn(col.name, HiveMetastoreTypes.toMetastoreType(attr.dataType), col.comment) + } + } + + val columnNames = childSchema.map(f => verbose(f.name)) + + // When user specified column names for view, we should create a project to do the renaming. + // When no column name specified, we still need to create a project to declare the columns + // we need, to make us more robust to top level `*`s. + val projectList = if (tableDesc.schema == Nil) { + columnNames.mkString(", ") + } else { + columnNames.zip(tableDesc.schema.map(f => verbose(f.name))).map { + case (name, alias) => s"$name AS $alias" + }.mkString(", ") + } + + val viewName = verbose(tableDesc.name) + + val expandedText = s"SELECT $projectList FROM (${tableDesc.viewText.get}) $viewName" + + tableDesc.copy(schema = schema, viewText = Some(expandedText)) + } + + // escape backtick with double-backtick in column name and wrap it with backtick. 
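For example, a standalone copy of the quoting helper defined below behaves like this:

```scala
def verbose(name: String): String = s"`${name.replaceAll("`", "``")}`"

verbose("col1")     // `col1`
verbose("odd`name") // `odd``name`
```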
+ private def verbose(name: String) = s"`${name.replaceAll("`", "``")}`" +} diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala index 8c3f9ac202637..ec5b83b98e401 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala @@ -1248,4 +1248,121 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton { """.stripMargin), Row("b", 6.0) :: Row("a", 7.0) :: Nil) } } + + test("correctly parse CREATE VIEW statement") { + withSQLConf(SQLConf.CANONICALIZE_VIEW.key -> "true") { + withTable("jt") { + val df = (1 until 10).map(i => i -> i).toDF("i", "j") + df.write.format("json").saveAsTable("jt") + sql( + """CREATE VIEW IF NOT EXISTS + |default.testView (c1 COMMENT 'blabla', c2 COMMENT 'blabla') + |COMMENT 'blabla' + |TBLPROPERTIES ('a' = 'b') + |AS SELECT * FROM jt""".stripMargin) + checkAnswer(sql("SELECT c1, c2 FROM testView ORDER BY c1"), (1 to 9).map(i => Row(i, i))) + sql("DROP VIEW testView") + } + } + } + + test("correctly handle CREATE VIEW IF NOT EXISTS") { + withSQLConf(SQLConf.CANONICALIZE_VIEW.key -> "true") { + withTable("jt", "jt2") { + sqlContext.range(1, 10).write.format("json").saveAsTable("jt") + sql("CREATE VIEW testView AS SELECT id FROM jt") + + val df = (1 until 10).map(i => i -> i).toDF("i", "j") + df.write.format("json").saveAsTable("jt2") + sql("CREATE VIEW IF NOT EXISTS testView AS SELECT * FROM jt2") + + // make sure our view doesn't change. + checkAnswer(sql("SELECT * FROM testView ORDER BY id"), (1 to 9).map(i => Row(i))) + sql("DROP VIEW testView") + } + } + } + + test("correctly handle CREATE OR REPLACE VIEW") { + withSQLConf(SQLConf.CANONICALIZE_VIEW.key -> "true") { + withTable("jt", "jt2") { + sqlContext.range(1, 10).write.format("json").saveAsTable("jt") + sql("CREATE OR REPLACE VIEW testView AS SELECT id FROM jt") + checkAnswer(sql("SELECT * FROM testView ORDER BY id"), (1 to 9).map(i => Row(i))) + + val df = (1 until 10).map(i => i -> i).toDF("i", "j") + df.write.format("json").saveAsTable("jt2") + sql("CREATE OR REPLACE VIEW testView AS SELECT * FROM jt2") + // make sure the view has been changed. + checkAnswer(sql("SELECT * FROM testView ORDER BY i"), (1 to 9).map(i => Row(i, i))) + + sql("DROP VIEW testView") + + val e = intercept[AnalysisException] { + sql("CREATE OR REPLACE VIEW IF NOT EXISTS testView AS SELECT id FROM jt") + } + assert(e.message.contains("not allowed to define a view")) + } + } + } + + test("correctly handle ALTER VIEW") { + withSQLConf(SQLConf.CANONICALIZE_VIEW.key -> "true") { + withTable("jt", "jt2") { + sqlContext.range(1, 10).write.format("json").saveAsTable("jt") + sql("CREATE VIEW testView AS SELECT id FROM jt") + + val df = (1 until 10).map(i => i -> i).toDF("i", "j") + df.write.format("json").saveAsTable("jt2") + sql("ALTER VIEW testView AS SELECT * FROM jt2") + // make sure the view has been changed. + checkAnswer(sql("SELECT * FROM testView ORDER BY i"), (1 to 9).map(i => Row(i, i))) + + sql("DROP VIEW testView") + } + } + } + + test("create hive view for json table") { + // json table is not hive-compatible, make sure the new flag fix it. 
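Outside the test harness the same scenario looks roughly like this (a hedged sketch; `sqlContext` is assumed to be a `HiveContext`):

```scala
// A JSON data source table: Spark keeps its schema in table properties rather than
// in a Hive-readable form, so Hive's own CREATE VIEW cannot resolve its columns.
sqlContext.range(1, 10).write.format("json").saveAsTable("jt")

// With the default (false) setting the statement runs as a Hive native command and is
// expected to fail for such a table; the new flag routes it through Spark SQL instead.
sqlContext.setConf("spark.sql.canonicalizeView", "true")
sqlContext.sql("CREATE VIEW testView AS SELECT id FROM jt")
sqlContext.sql("SELECT * FROM testView").show()
```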
+ withSQLConf(SQLConf.CANONICALIZE_VIEW.key -> "true") { + withTable("jt") { + sqlContext.range(1, 10).write.format("json").saveAsTable("jt") + sql("CREATE VIEW testView AS SELECT id FROM jt") + checkAnswer(sql("SELECT * FROM testView ORDER BY id"), (1 to 9).map(i => Row(i))) + sql("DROP VIEW testView") + } + } + } + + test("create hive view for partitioned parquet table") { + // partitioned parquet table is not hive-compatible, make sure the new flag fix it. + withSQLConf(SQLConf.CANONICALIZE_VIEW.key -> "true") { + withTable("parTable") { + val df = Seq(1 -> "a").toDF("i", "j") + df.write.format("parquet").partitionBy("i").saveAsTable("parTable") + sql("CREATE VIEW testView AS SELECT i, j FROM parTable") + checkAnswer(sql("SELECT * FROM testView"), Row(1, "a")) + sql("DROP VIEW testView") + } + } + } + + test("create hive view for joined tables") { + // make sure the new flag can handle some complex cases like join and schema change. + withSQLConf(SQLConf.CANONICALIZE_VIEW.key -> "true") { + withTable("jt1", "jt2") { + sqlContext.range(1, 10).toDF("id1").write.format("json").saveAsTable("jt1") + sqlContext.range(1, 10).toDF("id2").write.format("json").saveAsTable("jt2") + sql("CREATE VIEW testView AS SELECT * FROM jt1 JOIN jt2 ON id1 == id2") + checkAnswer(sql("SELECT * FROM testView ORDER BY id1"), (1 to 9).map(i => Row(i, i))) + + val df = (1 until 10).map(i => i -> i).toDF("id1", "newCol") + df.write.format("json").mode(SaveMode.Overwrite).saveAsTable("jt1") + checkAnswer(sql("SELECT * FROM testView ORDER BY id1"), (1 to 9).map(i => Row(i, i))) + + sql("DROP VIEW testView") + } + } + } } From a8226a9f14e81c0b6712a30f1a60276200faebac Mon Sep 17 00:00:00 2001 From: Michael Armbrust Date: Thu, 8 Oct 2015 13:49:10 -0700 Subject: [PATCH 002/139] Revert [SPARK-8654] [SQL] Fix Analysis exception when using NULL IN This reverts commit dcbd58a929be0058b1cfa59b14898c4c428a7680 from #8983 Author: Michael Armbrust Closes #9034 from marmbrus/revert8654. --- .../catalyst/analysis/HiveTypeCoercion.scala | 10 ++------- .../sql/catalyst/analysis/AnalysisSuite.scala | 21 ------------------- 2 files changed, 2 insertions(+), 29 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala index 7192c931d2e51..87a3845b2d9e5 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala @@ -304,10 +304,7 @@ object HiveTypeCoercion { } /** - * Convert the value and in list expressions to the common operator type - * by looking at all the argument types and finding the closest one that - * all the arguments can be cast to. When no common operator type is found - * an Analysis Exception is raised. 
+ * Convert all expressions in in() list to the left operator type */ object InConversion extends Rule[LogicalPlan] { def apply(plan: LogicalPlan): LogicalPlan = plan resolveExpressions { @@ -315,10 +312,7 @@ object HiveTypeCoercion { case e if !e.childrenResolved => e case i @ In(a, b) if b.exists(_.dataType != a.dataType) => - findWiderCommonType(i.children.map(_.dataType)) match { - case Some(finalDataType) => i.withNewChildren(i.children.map(Cast(_, finalDataType))) - case None => i - } + i.makeCopy(Array(a, b.map(Cast(_, a.dataType)))) } } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala index 77a4765e7751c..820b336aac759 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala @@ -135,25 +135,4 @@ class AnalysisSuite extends AnalysisTest { plan = testRelation.select(CreateStructUnsafe(Seq(a, (a + 1).as("a+1"))).as("col")) checkAnalysis(plan, plan) } - - test("SPARK-8654: invalid CAST in NULL IN(...) expression") { - val plan = Project(Alias(In(Literal(null), Seq(Literal(1), Literal(2))), "a")() :: Nil, - LocalRelation() - ) - assertAnalysisSuccess(plan) - } - - test("SPARK-8654: different types in inlist but can be converted to a commmon type") { - val plan = Project(Alias(In(Literal(null), Seq(Literal(1), Literal(1.2345))), "a")() :: Nil, - LocalRelation() - ) - assertAnalysisSuccess(plan) - } - - test("SPARK-8654: check type compatibility error") { - val plan = Project(Alias(In(Literal(null), Seq(Literal(true), Literal(1))), "a")() :: Nil, - LocalRelation() - ) - assertAnalysisError(plan, Seq("data type mismatch: Arguments must be same type")) - } } From 9e66a53c9955285a85c19f55c3ef62db2e1b868a Mon Sep 17 00:00:00 2001 From: Michael Armbrust Date: Thu, 8 Oct 2015 14:28:14 -0700 Subject: [PATCH 003/139] [SPARK-10993] [SQL] Inital code generated encoder for product types This PR is a first cut at code generating an encoder that takes a Scala `Product` type and converts it directly into the tungsten binary format. This is done through the addition of a new set of expression that can be used to invoke methods on raw JVM objects, extracting fields and converting the result into the required format. These can then be used directly in an `UnsafeProjection` allowing us to leverage the existing encoding logic. According to some simple benchmarks, this can significantly speed up conversion (~4x). However, replacing CatalystConverters is deferred to a later PR to keep this PR at a reasonable size. 
```scala case class SomeInts(a: Int, b: Int, c: Int, d: Int, e: Int) val data = SomeInts(1, 2, 3, 4, 5) val encoder = ProductEncoder[SomeInts] val converter = CatalystTypeConverters.createToCatalystConverter(ScalaReflection.schemaFor[SomeInts].dataType) (1 to 5).foreach {iter => benchmark(s"converter $iter") { var i = 100000000 while (i > 0) { val res = converter(data).asInstanceOf[InternalRow] assert(res.getInt(0) == 1) assert(res.getInt(1) == 2) i -= 1 } } benchmark(s"encoder $iter") { var i = 100000000 while (i > 0) { val res = encoder.toRow(data) assert(res.getInt(0) == 1) assert(res.getInt(1) == 2) i -= 1 } } } ``` Results: ``` [info] converter 1: 7170ms [info] encoder 1: 1888ms [info] converter 2: 6763ms [info] encoder 2: 1824ms [info] converter 3: 6912ms [info] encoder 3: 1802ms [info] converter 4: 7131ms [info] encoder 4: 1798ms [info] converter 5: 7350ms [info] encoder 5: 1912ms ``` Author: Michael Armbrust Closes #9019 from marmbrus/productEncoder. --- .../spark/sql/catalyst/ScalaReflection.scala | 238 ++++++++++++- .../spark/sql/catalyst/encoders/Encoder.scala | 44 +++ .../catalyst/encoders/ProductEncoder.scala | 67 ++++ .../expressions/codegen/CodeGenerator.scala | 4 +- .../sql/catalyst/expressions/objects.scala | 334 ++++++++++++++++++ .../spark/sql/types/GenericArrayData.scala | 9 + .../apache/spark/sql/types/ObjectType.scala | 42 +++ .../encoders/ProductEncoderSuite.scala | 174 +++++++++ 8 files changed, 910 insertions(+), 2 deletions(-) create mode 100644 sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/encoders/Encoder.scala create mode 100644 sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/encoders/ProductEncoder.scala create mode 100644 sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/objects.scala create mode 100644 sql/catalyst/src/main/scala/org/apache/spark/sql/types/ObjectType.scala create mode 100644 sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/encoders/ProductEncoderSuite.scala diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ScalaReflection.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ScalaReflection.scala index 2442341da106d..8b733f2a0b91f 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ScalaReflection.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ScalaReflection.scala @@ -17,6 +17,7 @@ package org.apache.spark.sql.catalyst +import org.apache.spark.sql.catalyst.util.DateTimeUtils import org.apache.spark.unsafe.types.UTF8String import org.apache.spark.util.Utils import org.apache.spark.sql.catalyst.expressions._ @@ -75,6 +76,242 @@ trait ScalaReflection { */ private def localTypeOf[T: TypeTag]: `Type` = typeTag[T].in(mirror).tpe + /** + * Returns the Spark SQL DataType for a given scala type. Where this is not an exact mapping + * to a native type, an ObjectType is returned. Special handling is also used for Arrays including + * those that hold primitive types. 
+ */ + def dataTypeFor(tpe: `Type`): DataType = tpe match { + case t if t <:< definitions.IntTpe => IntegerType + case t if t <:< definitions.LongTpe => LongType + case t if t <:< definitions.DoubleTpe => DoubleType + case t if t <:< definitions.FloatTpe => FloatType + case t if t <:< definitions.ShortTpe => ShortType + case t if t <:< definitions.ByteTpe => ByteType + case t if t <:< definitions.BooleanTpe => BooleanType + case t if t <:< localTypeOf[Array[Byte]] => BinaryType + case _ => + val className: String = tpe.erasure.typeSymbol.asClass.fullName + className match { + case "scala.Array" => + val TypeRef(_, _, Seq(arrayType)) = tpe + val cls = arrayType match { + case t if t <:< definitions.IntTpe => classOf[Array[Int]] + case t if t <:< definitions.LongTpe => classOf[Array[Long]] + case t if t <:< definitions.DoubleTpe => classOf[Array[Double]] + case t if t <:< definitions.FloatTpe => classOf[Array[Float]] + case t if t <:< definitions.ShortTpe => classOf[Array[Short]] + case t if t <:< definitions.ByteTpe => classOf[Array[Byte]] + case t if t <:< definitions.BooleanTpe => classOf[Array[Boolean]] + case other => + // There is probably a better way to do this, but I couldn't find it... + val elementType = dataTypeFor(other).asInstanceOf[ObjectType].cls + java.lang.reflect.Array.newInstance(elementType, 1).getClass + + } + ObjectType(cls) + case other => ObjectType(Utils.classForName(className)) + } + } + + /** Returns expressions for extracting all the fields from the given type. */ + def extractorsFor[T : TypeTag](inputObject: Expression): Seq[Expression] = { + ScalaReflectionLock.synchronized { + extractorFor(inputObject, typeTag[T].tpe).asInstanceOf[CreateStruct].children + } + } + + /** Helper for extracting internal fields from a case class. */ + protected def extractorFor( + inputObject: Expression, + tpe: `Type`): Expression = ScalaReflectionLock.synchronized { + if (!inputObject.dataType.isInstanceOf[ObjectType]) { + inputObject + } else { + tpe match { + case t if t <:< localTypeOf[Option[_]] => + val TypeRef(_, _, Seq(optType)) = t + optType match { + // For primitive types we must manually unbox the value of the object. + case t if t <:< definitions.IntTpe => + Invoke( + UnwrapOption(ObjectType(classOf[java.lang.Integer]), inputObject), + "intValue", + IntegerType) + case t if t <:< definitions.LongTpe => + Invoke( + UnwrapOption(ObjectType(classOf[java.lang.Long]), inputObject), + "longValue", + LongType) + case t if t <:< definitions.DoubleTpe => + Invoke( + UnwrapOption(ObjectType(classOf[java.lang.Double]), inputObject), + "doubleValue", + DoubleType) + case t if t <:< definitions.FloatTpe => + Invoke( + UnwrapOption(ObjectType(classOf[java.lang.Float]), inputObject), + "floatValue", + FloatType) + case t if t <:< definitions.ShortTpe => + Invoke( + UnwrapOption(ObjectType(classOf[java.lang.Short]), inputObject), + "shortValue", + ShortType) + case t if t <:< definitions.ByteTpe => + Invoke( + UnwrapOption(ObjectType(classOf[java.lang.Byte]), inputObject), + "byteValue", + ByteType) + case t if t <:< definitions.BooleanTpe => + Invoke( + UnwrapOption(ObjectType(classOf[java.lang.Boolean]), inputObject), + "booleanValue", + BooleanType) + + // For non-primitives, we can just extract the object from the Option and then recurse. 
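A plain-Scala model of that Option handling; the real code expresses it with `UnwrapOption`, an `IsNull` check and a recursive `extractorFor` call:

```scala
def unwrapOption[A](opt: Option[A])(recurse: A => Any): Any = opt match {
  case None        => null           // surfaces as a SQL NULL of the field's type
  case Some(value) => recurse(value) // keep extracting fields from the inner object
}

unwrapOption(Option("abc"))(_.length)        // 3
unwrapOption(None: Option[String])(_.length) // null
```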
+ case other => + val className: String = optType.erasure.typeSymbol.asClass.fullName + val classObj = Utils.classForName(className) + val optionObjectType = ObjectType(classObj) + + val unwrapped = UnwrapOption(optionObjectType, inputObject) + expressions.If( + IsNull(unwrapped), + expressions.Literal.create(null, schemaFor(optType).dataType), + extractorFor(unwrapped, optType)) + } + + case t if t <:< localTypeOf[Product] => + val formalTypeArgs = t.typeSymbol.asClass.typeParams + val TypeRef(_, _, actualTypeArgs) = t + val constructorSymbol = t.member(nme.CONSTRUCTOR) + val params = if (constructorSymbol.isMethod) { + constructorSymbol.asMethod.paramss + } else { + // Find the primary constructor, and use its parameter ordering. + val primaryConstructorSymbol: Option[Symbol] = + constructorSymbol.asTerm.alternatives.find(s => + s.isMethod && s.asMethod.isPrimaryConstructor) + + if (primaryConstructorSymbol.isEmpty) { + sys.error("Internal SQL error: Product object did not have a primary constructor.") + } else { + primaryConstructorSymbol.get.asMethod.paramss + } + } + + CreateStruct(params.head.map { p => + val fieldName = p.name.toString + val fieldType = p.typeSignature.substituteTypes(formalTypeArgs, actualTypeArgs) + val fieldValue = Invoke(inputObject, fieldName, dataTypeFor(fieldType)) + extractorFor(fieldValue, fieldType) + }) + + case t if t <:< localTypeOf[Array[_]] => + val TypeRef(_, _, Seq(elementType)) = t + val elementDataType = dataTypeFor(elementType) + val Schema(dataType, nullable) = schemaFor(elementType) + + if (!elementDataType.isInstanceOf[AtomicType]) { + MapObjects(extractorFor(_, elementType), inputObject, elementDataType) + } else { + NewInstance( + classOf[GenericArrayData], + inputObject :: Nil, + dataType = ArrayType(dataType, nullable)) + } + + case t if t <:< localTypeOf[Seq[_]] => + val TypeRef(_, _, Seq(elementType)) = t + val elementDataType = dataTypeFor(elementType) + val Schema(dataType, nullable) = schemaFor(elementType) + + if (!elementDataType.isInstanceOf[AtomicType]) { + MapObjects(extractorFor(_, elementType), inputObject, elementDataType) + } else { + NewInstance( + classOf[GenericArrayData], + inputObject :: Nil, + dataType = ArrayType(dataType, nullable)) + } + + case t if t <:< localTypeOf[Map[_, _]] => + val TypeRef(_, _, Seq(keyType, valueType)) = t + val Schema(keyDataType, _) = schemaFor(keyType) + val Schema(valueDataType, valueNullable) = schemaFor(valueType) + + val rawMap = inputObject + val keys = + NewInstance( + classOf[GenericArrayData], + Invoke(rawMap, "keys", ObjectType(classOf[scala.collection.GenIterable[_]])) :: Nil, + dataType = ObjectType(classOf[ArrayData])) + val values = + NewInstance( + classOf[GenericArrayData], + Invoke(rawMap, "values", ObjectType(classOf[scala.collection.GenIterable[_]])) :: Nil, + dataType = ObjectType(classOf[ArrayData])) + NewInstance( + classOf[ArrayBasedMapData], + keys :: values :: Nil, + dataType = MapType(keyDataType, valueDataType, valueNullable)) + + case t if t <:< localTypeOf[String] => + StaticInvoke( + classOf[UTF8String], + StringType, + "fromString", + inputObject :: Nil) + + case t if t <:< localTypeOf[java.sql.Timestamp] => + StaticInvoke( + DateTimeUtils, + TimestampType, + "fromJavaTimestamp", + inputObject :: Nil) + + case t if t <:< localTypeOf[java.sql.Date] => + StaticInvoke( + DateTimeUtils, + DateType, + "fromJavaDate", + inputObject :: Nil) + case t if t <:< localTypeOf[BigDecimal] => + StaticInvoke( + Decimal, + DecimalType.SYSTEM_DEFAULT, + "apply", + inputObject :: 
Nil) + + case t if t <:< localTypeOf[java.math.BigDecimal] => + StaticInvoke( + Decimal, + DecimalType.SYSTEM_DEFAULT, + "apply", + inputObject :: Nil) + + case t if t <:< localTypeOf[java.lang.Integer] => + Invoke(inputObject, "intValue", IntegerType) + case t if t <:< localTypeOf[java.lang.Long] => + Invoke(inputObject, "longValue", LongType) + case t if t <:< localTypeOf[java.lang.Double] => + Invoke(inputObject, "doubleValue", DoubleType) + case t if t <:< localTypeOf[java.lang.Float] => + Invoke(inputObject, "floatValue", FloatType) + case t if t <:< localTypeOf[java.lang.Short] => + Invoke(inputObject, "shortValue", ShortType) + case t if t <:< localTypeOf[java.lang.Byte] => + Invoke(inputObject, "byteValue", ByteType) + case t if t <:< localTypeOf[java.lang.Boolean] => + Invoke(inputObject, "booleanValue", BooleanType) + + case other => + throw new UnsupportedOperationException(s"Extractor for type $other is not supported") + } + } + } + /** Returns a catalyst DataType and its nullability for the given Scala Type using reflection. */ def schemaFor(tpe: `Type`): Schema = ScalaReflectionLock.synchronized { val className: String = tpe.erasure.typeSymbol.asClass.fullName @@ -91,7 +328,6 @@ trait ScalaReflection { case t if t <:< localTypeOf[Option[_]] => val TypeRef(_, _, Seq(optType)) = t Schema(schemaFor(optType).dataType, nullable = true) - // Need to decide if we actually need a special type here. case t if t <:< localTypeOf[Array[Byte]] => Schema(BinaryType, nullable = true) case t if t <:< localTypeOf[Array[_]] => val TypeRef(_, _, Seq(elementType)) = t diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/encoders/Encoder.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/encoders/Encoder.scala new file mode 100644 index 0000000000000..8dacfa9477ee6 --- /dev/null +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/encoders/Encoder.scala @@ -0,0 +1,44 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.catalyst.encoders + +import scala.reflect.ClassTag + +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.types.StructType + +/** + * Used to convert a JVM object of type `T` to and from the internal Spark SQL representation. + * + * Encoders are not intended to be thread-safe and thus they are allow to avoid internal locking + * and reuse internal buffers to improve performance. + */ +trait Encoder[T] { + /** Returns the schema of encoding this type of object as a Row. */ + def schema: StructType + + /** A ClassTag that can be used to construct and Array to contain a collection of `T`. */ + def clsTag: ClassTag[T] + + /** + * Returns an encoded version of `t` as a Spark SQL row. 
Note that multiple calls to + * toRow are allowed to return the same actual [[InternalRow]] object. Thus, the caller should + * copy the result before making another call if required. + */ + def toRow(t: T): InternalRow +} diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/encoders/ProductEncoder.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/encoders/ProductEncoder.scala new file mode 100644 index 0000000000000..a23613673ebb5 --- /dev/null +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/encoders/ProductEncoder.scala @@ -0,0 +1,67 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.catalyst.encoders + +import org.apache.spark.sql.catalyst.expressions._ +import org.apache.spark.sql.catalyst.expressions.codegen.GenerateUnsafeProjection + +import scala.reflect.ClassTag +import scala.reflect.runtime.universe.{typeTag, TypeTag} + +import org.apache.spark.sql.catalyst.{ScalaReflection, InternalRow} +import org.apache.spark.sql.types.{ObjectType, StructType} + +/** + * A factory for constructing encoders that convert Scala's product type to/from the Spark SQL + * internal binary representation. + */ +object ProductEncoder { + def apply[T <: Product : TypeTag]: Encoder[T] = { + // We convert the not-serializable TypeTag into StructType and ClassTag. + val schema = ScalaReflection.schemaFor[T].dataType.asInstanceOf[StructType] + val mirror = typeTag[T].mirror + val cls = mirror.runtimeClass(typeTag[T].tpe) + + val inputObject = BoundReference(0, ObjectType(cls), nullable = true) + val extractExpressions = ScalaReflection.extractorsFor[T](inputObject) + new ClassEncoder[T](schema, extractExpressions, ClassTag[T](cls)) + } +} + +/** + * A generic encoder for JVM objects. + * + * @param schema The schema after converting `T` to a Spark SQL row. + * @param extractExpressions A set of expressions, one for each top-level field that can be used to + * extract the values from a raw object. + * @param clsTag A classtag for `T`. 
+ */ +case class ClassEncoder[T]( + schema: StructType, + extractExpressions: Seq[Expression], + clsTag: ClassTag[T]) + extends Encoder[T] { + + private val extractProjection = GenerateUnsafeProjection.generate(extractExpressions) + private val inputRow = new GenericMutableRow(1) + + override def toRow(t: T): InternalRow = { + inputRow(0) = t + extractProjection(inputRow) + } +} diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala index 2dd680454b4cf..a0fe5bd77e3aa 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala @@ -177,6 +177,8 @@ class CodeGenContext { case _: MapType => "MapData" case dt: OpenHashSetUDT if dt.elementType == IntegerType => classOf[IntegerHashSet].getName case dt: OpenHashSetUDT if dt.elementType == LongType => classOf[LongHashSet].getName + case ObjectType(cls) if cls.isArray => s"${javaType(ObjectType(cls.getComponentType))}[]" + case ObjectType(cls) => cls.getName case _ => "Object" } @@ -395,7 +397,7 @@ abstract class CodeGenerator[InType <: AnyRef, OutType <: AnyRef] extends Loggin logDebug({ // Only add extra debugging info to byte code when we are going to print the source code. - evaluator.setDebuggingInformation(false, true, false) + evaluator.setDebuggingInformation(true, true, false) withLineNums }) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/objects.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/objects.scala new file mode 100644 index 0000000000000..e1f960a6e605c --- /dev/null +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/objects.scala @@ -0,0 +1,334 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.catalyst.expressions + +import scala.language.existentials + +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.expressions.codegen.{GeneratedExpressionCode, CodeGenContext} +import org.apache.spark.sql.types._ + +/** + * Invokes a static function, returning the result. By default, any of the arguments being null + * will result in returning null instead of calling the function. + * + * @param staticObject The target of the static call. This can either be the object itself + * (methods defined on scala objects), or the class object + * (static methods defined in java). + * @param dataType The expected return type of the function call + * @param functionName The name of the method to call. 
+ * @param arguments An optional list of expressions to pass as arguments to the function. + * @param propagateNull When true, and any of the arguments is null, null will be returned instead + * of calling the function. + */ +case class StaticInvoke( + staticObject: Any, + dataType: DataType, + functionName: String, + arguments: Seq[Expression] = Nil, + propagateNull: Boolean = true) extends Expression { + + val objectName = staticObject match { + case c: Class[_] => c.getName + case other => other.getClass.getName.stripSuffix("$") + } + override def nullable: Boolean = true + override def children: Seq[Expression] = Nil + + override def eval(input: InternalRow): Any = + throw new UnsupportedOperationException("Only code-generated evaluation is supported.") + + override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = { + val javaType = ctx.javaType(dataType) + val argGen = arguments.map(_.gen(ctx)) + val argString = argGen.map(_.value).mkString(", ") + + if (propagateNull) { + val objNullCheck = if (ctx.defaultValue(dataType) == "null") { + s"${ev.isNull} = ${ev.value} == null;" + } else { + "" + } + + val argsNonNull = s"!(${argGen.map(_.isNull).mkString(" || ")})" + s""" + ${argGen.map(_.code).mkString("\n")} + + boolean ${ev.isNull} = true; + $javaType ${ev.value} = ${ctx.defaultValue(dataType)}; + + if ($argsNonNull) { + ${ev.value} = $objectName.$functionName($argString); + $objNullCheck + } + """ + } else { + s""" + ${argGen.map(_.code).mkString("\n")} + + final boolean ${ev.isNull} = ${ev.value} == null; + $javaType ${ev.value} = $objectName.$functionName($argString); + """ + } + } +} + +/** + * Calls the specified function on an object, optionally passing arguments. If the `targetObject` + * expression evaluates to null then null will be returned. + * + * @param targetObject An expression that will return the object to call the method on. + * @param functionName The name of the method to call. + * @param dataType The expected return type of the function. + * @param arguments An optional list of expressions, whos evaluation will be passed to the function. + */ +case class Invoke( + targetObject: Expression, + functionName: String, + dataType: DataType, + arguments: Seq[Expression] = Nil) extends Expression { + + override def nullable: Boolean = true + override def children: Seq[Expression] = targetObject :: Nil + + override def eval(input: InternalRow): Any = + throw new UnsupportedOperationException("Only code-generated evaluation is supported.") + + override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = { + val javaType = ctx.javaType(dataType) + val obj = targetObject.gen(ctx) + val argGen = arguments.map(_.gen(ctx)) + val argString = argGen.map(_.value).mkString(", ") + + // If the function can return null, we do an extra check to make sure our null bit is still set + // correctly. + val objNullCheck = if (ctx.defaultValue(dataType) == "null") { + s"${ev.isNull} = ${ev.value} == null;" + } else { + "" + } + + s""" + ${obj.code} + ${argGen.map(_.code).mkString("\n")} + + boolean ${ev.isNull} = ${obj.value} == null; + $javaType ${ev.value} = + ${ev.isNull} ? + ${ctx.defaultValue(dataType)} : ($javaType) ${obj.value}.$functionName($argString); + $objNullCheck + """ + } +} + +/** + * Constructs a new instance of the given class, using the result of evaluating the specified + * expressions as arguments. + * + * @param cls The class to construct. + * @param arguments A list of expression to use as arguments to the constructor. 
+ * @param propagateNull When true, if any of the arguments is null, then null will be returned + * instead of trying to construct the object. + * @param dataType The type of object being constructed, as a Spark SQL datatype. This allows you + * to manually specify the type when the object in question is a valid internal + * representation (i.e. ArrayData) instead of an object. + */ +case class NewInstance( + cls: Class[_], + arguments: Seq[Expression], + propagateNull: Boolean = true, + dataType: DataType) extends Expression { + private val className = cls.getName + + override def nullable: Boolean = propagateNull + + override def children: Seq[Expression] = arguments + + override def eval(input: InternalRow): Any = + throw new UnsupportedOperationException("Only code-generated evaluation is supported.") + + override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = { + val javaType = ctx.javaType(dataType) + val argGen = arguments.map(_.gen(ctx)) + val argString = argGen.map(_.value).mkString(", ") + + if (propagateNull) { + val objNullCheck = if (ctx.defaultValue(dataType) == "null") { + s"${ev.isNull} = ${ev.value} == null;" + } else { + "" + } + + val argsNonNull = s"!(${argGen.map(_.isNull).mkString(" || ")})" + s""" + ${argGen.map(_.code).mkString("\n")} + + boolean ${ev.isNull} = true; + $javaType ${ev.value} = ${ctx.defaultValue(dataType)}; + + if ($argsNonNull) { + ${ev.value} = new $className($argString); + ${ev.isNull} = false; + } + """ + } else { + s""" + ${argGen.map(_.code).mkString("\n")} + + final boolean ${ev.isNull} = ${ev.value} == null; + $javaType ${ev.value} = new $className($argString); + """ + } + } +} + +/** + * Given an expression that returns on object of type `Option[_]`, this expression unwraps the + * option into the specified Spark SQL datatype. In the case of `None`, the nullbit is set instead. + * + * @param dataType The expected unwrapped option type. + * @param child An expression that returns an `Option` + */ +case class UnwrapOption( + dataType: DataType, + child: Expression) extends UnaryExpression with ExpectsInputTypes { + + override def nullable: Boolean = true + + override def children: Seq[Expression] = Nil + + override def inputTypes: Seq[AbstractDataType] = ObjectType :: Nil + + override def eval(input: InternalRow): Any = + throw new UnsupportedOperationException("Only code-generated evaluation is supported") + + override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = { + val javaType = ctx.javaType(dataType) + val inputObject = child.gen(ctx) + + s""" + ${inputObject.code} + + boolean ${ev.isNull} = ${inputObject.value} == null || ${inputObject.value}.isEmpty(); + $javaType ${ev.value} = + ${ev.isNull} ? 
${ctx.defaultValue(dataType)} : ($javaType)${inputObject.value}.get();
+    """
+  }
+}
+
+case class LambdaVariable(value: String, isNull: String, dataType: DataType) extends Expression {
+
+  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String =
+    throw new UnsupportedOperationException("Only calling gen() is supported.")
+
+  override def children: Seq[Expression] = Nil
+  override def gen(ctx: CodeGenContext): GeneratedExpressionCode =
+    GeneratedExpressionCode(code = "", value = value, isNull = isNull)
+
+  override def nullable: Boolean = false
+  override def eval(input: InternalRow): Any =
+    throw new UnsupportedOperationException("Only code-generated evaluation is supported.")
+
+}
+
+/**
+ * Applies the given expression to every element of a collection of items, returning the result
+ * as an ArrayType. This is similar to a typical map operation, but where the lambda function
+ * is expressed using catalyst expressions.
+ *
+ * The following collection ObjectTypes are currently supported: Seq, Array
+ *
+ * @param function A function that returns an expression, given an attribute that can be used
+ *                 to access the current value. This is done as a lambda function so that
+ *                 a unique attribute reference can be provided for each expression (thus allowing
+ *                 us to nest multiple MapObjects calls).
+ * @param inputData An expression that when evaluated returns a collection object.
+ * @param elementType The type of element in the collection, expressed as a DataType.
+ */
+case class MapObjects(
+    function: AttributeReference => Expression,
+    inputData: Expression,
+    elementType: DataType) extends Expression {
+
+  private val loopAttribute = AttributeReference("loopVar", elementType)()
+  private val completeFunction = function(loopAttribute)
+
+  private val (lengthFunction, itemAccessor) = inputData.dataType match {
+    case ObjectType(cls) if cls.isAssignableFrom(classOf[Seq[_]]) =>
+      (".size()", (i: String) => s".apply($i)")
+    case ObjectType(cls) if cls.isArray =>
+      (".length", (i: String) => s"[$i]")
+  }
+
+  override def nullable: Boolean = true
+
+  override def children: Seq[Expression] = completeFunction :: inputData :: Nil
+
+  override def eval(input: InternalRow): Any =
+    throw new UnsupportedOperationException("Only code-generated evaluation is supported")
+
+  override def dataType: DataType = ArrayType(completeFunction.dataType)
+
+  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
+    val javaType = ctx.javaType(dataType)
+    val elementJavaType = ctx.javaType(elementType)
+    val genInputData = inputData.gen(ctx)
+
+    // Variables to hold the element that is currently being processed.
+ val loopValue = ctx.freshName("loopValue") + val loopIsNull = ctx.freshName("loopIsNull") + + val loopVariable = LambdaVariable(loopValue, loopIsNull, elementType) + val boundFunction = completeFunction transform { + case a: AttributeReference if a == loopAttribute => loopVariable + } + + val genFunction = boundFunction.gen(ctx) + val dataLength = ctx.freshName("dataLength") + val convertedArray = ctx.freshName("convertedArray") + val loopIndex = ctx.freshName("loopIndex") + + s""" + ${genInputData.code} + + boolean ${ev.isNull} = ${genInputData.value} == null; + $javaType ${ev.value} = ${ctx.defaultValue(dataType)}; + + if (!${ev.isNull}) { + Object[] $convertedArray = null; + int $dataLength = ${genInputData.value}$lengthFunction; + $convertedArray = new Object[$dataLength]; + + int $loopIndex = 0; + while ($loopIndex < $dataLength) { + $elementJavaType $loopValue = + ($elementJavaType)${genInputData.value}${itemAccessor(loopIndex)}; + boolean $loopIsNull = $loopValue == null; + + ${genFunction.code} + + $convertedArray[$loopIndex] = ${genFunction.value}; + $loopIndex += 1; + } + + ${ev.isNull} = false; + ${ev.value} = new ${classOf[GenericArrayData].getName}($convertedArray); + } + """ + } +} diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/GenericArrayData.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/GenericArrayData.scala index 459fcb6fc0acc..c3816033275d5 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/GenericArrayData.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/GenericArrayData.scala @@ -22,6 +22,15 @@ import org.apache.spark.unsafe.types.{CalendarInterval, UTF8String} class GenericArrayData(private[sql] val array: Array[Any]) extends ArrayData { + def this(seq: scala.collection.GenIterable[Any]) = this(seq.toArray) + + // TODO: This is boxing. We should specialize. + def this(primitiveArray: Array[Int]) = this(primitiveArray.toSeq) + def this(primitiveArray: Array[Long]) = this(primitiveArray.toSeq) + def this(primitiveArray: Array[Float]) = this(primitiveArray.toSeq) + def this(primitiveArray: Array[Double]) = this(primitiveArray.toSeq) + def this(primitiveArray: Array[Boolean]) = this(primitiveArray.toSeq) + override def copy(): ArrayData = new GenericArrayData(array.clone()) override def numElements(): Int = array.length diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/ObjectType.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/ObjectType.scala new file mode 100644 index 0000000000000..fca0b799eb809 --- /dev/null +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/ObjectType.scala @@ -0,0 +1,42 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.types + +import scala.language.existentials + +private[sql] object ObjectType extends AbstractDataType { + override private[sql] def defaultConcreteType: DataType = + throw new UnsupportedOperationException("null literals can't be casted to ObjectType") + + // No casting or comparison is supported. + override private[sql] def acceptsType(other: DataType): Boolean = false + + override private[sql] def simpleString: String = "Object" +} + +/** + * Represents a JVM object that is passing through Spark SQL expression evaluation. Note this + * is only used internally while converting into the internal format and is not intended for use + * outside of the execution engine. + */ +private[sql] case class ObjectType(cls: Class[_]) extends DataType { + override def defaultSize: Int = + throw new UnsupportedOperationException("No size estimation available for objects.") + + def asNullable: DataType = this +} diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/encoders/ProductEncoderSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/encoders/ProductEncoderSuite.scala new file mode 100644 index 0000000000000..99c993d3febc2 --- /dev/null +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/encoders/ProductEncoderSuite.scala @@ -0,0 +1,174 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.catalyst.encoders + +import java.sql.{Date, Timestamp} + +import org.apache.spark.SparkFunSuite +import org.apache.spark.sql.catalyst.ScalaReflection._ +import org.apache.spark.sql.catalyst.expressions.UnsafeProjection +import org.apache.spark.sql.catalyst._ + + +case class RepeatedStruct(s: Seq[PrimitiveData]) + +case class NestedArray(a: Array[Array[Int]]) + +class ProductEncoderSuite extends SparkFunSuite { + + test("convert PrimitiveData to InternalRow") { + val inputData = PrimitiveData(1, 1, 1, 1, 1, 1, true) + val encoder = ProductEncoder[PrimitiveData] + val convertedData = encoder.toRow(inputData) + + assert(convertedData.getInt(0) == 1) + assert(convertedData.getLong(1) == 1.toLong) + assert(convertedData.getDouble(2) == 1.toDouble) + assert(convertedData.getFloat(3) == 1.toFloat) + assert(convertedData.getShort(4) == 1.toShort) + assert(convertedData.getByte(5) == 1.toByte) + assert(convertedData.getBoolean(6) == true) + } + + test("convert Some[_] to InternalRow") { + val primitiveData = PrimitiveData(1, 1, 1, 1, 1, 1, true) + val inputData = OptionalData(Some(2), Some(2), Some(2), Some(2), Some(2), Some(2), Some(true), + Some(primitiveData)) + + val encoder = ProductEncoder[OptionalData] + val convertedData = encoder.toRow(inputData) + + assert(convertedData.getInt(0) == 2) + assert(convertedData.getLong(1) == 2.toLong) + assert(convertedData.getDouble(2) == 2.toDouble) + assert(convertedData.getFloat(3) == 2.toFloat) + assert(convertedData.getShort(4) == 2.toShort) + assert(convertedData.getByte(5) == 2.toByte) + assert(convertedData.getBoolean(6) == true) + + val nestedRow = convertedData.getStruct(7, 7) + assert(nestedRow.getInt(0) == 1) + assert(nestedRow.getLong(1) == 1.toLong) + assert(nestedRow.getDouble(2) == 1.toDouble) + assert(nestedRow.getFloat(3) == 1.toFloat) + assert(nestedRow.getShort(4) == 1.toShort) + assert(nestedRow.getByte(5) == 1.toByte) + assert(nestedRow.getBoolean(6) == true) + } + + test("convert None to InternalRow") { + val inputData = OptionalData(None, None, None, None, None, None, None, None) + val encoder = ProductEncoder[OptionalData] + val convertedData = encoder.toRow(inputData) + + assert(convertedData.isNullAt(0)) + assert(convertedData.isNullAt(1)) + assert(convertedData.isNullAt(2)) + assert(convertedData.isNullAt(3)) + assert(convertedData.isNullAt(4)) + assert(convertedData.isNullAt(5)) + assert(convertedData.isNullAt(6)) + assert(convertedData.isNullAt(7)) + } + + test("convert nullable but present data to InternalRow") { + val inputData = NullableData( + 1, 1L, 1.0, 1.0f, 1.toShort, 1.toByte, true, "test", new java.math.BigDecimal(1), new Date(0), + new Timestamp(0), Array[Byte](1, 2, 3)) + + val encoder = ProductEncoder[NullableData] + val convertedData = encoder.toRow(inputData) + + assert(convertedData.getInt(0) == 1) + assert(convertedData.getLong(1) == 1.toLong) + assert(convertedData.getDouble(2) == 1.toDouble) + assert(convertedData.getFloat(3) == 1.toFloat) + assert(convertedData.getShort(4) == 1.toShort) + assert(convertedData.getByte(5) == 1.toByte) + assert(convertedData.getBoolean(6) == true) + } + + test("convert nullable data to InternalRow") { + val inputData = + NullableData(null, null, null, null, null, null, null, null, null, null, null, null) + + val encoder = ProductEncoder[NullableData] + val convertedData = encoder.toRow(inputData) + + assert(convertedData.isNullAt(0)) + assert(convertedData.isNullAt(1)) + assert(convertedData.isNullAt(2)) + assert(convertedData.isNullAt(3)) + 
assert(convertedData.isNullAt(4))
+    assert(convertedData.isNullAt(5))
+    assert(convertedData.isNullAt(6))
+    assert(convertedData.isNullAt(7))
+    assert(convertedData.isNullAt(8))
+    assert(convertedData.isNullAt(9))
+    assert(convertedData.isNullAt(10))
+    assert(convertedData.isNullAt(11))
+  }
+
+  test("convert repeated struct") {
+    val inputData = RepeatedStruct(PrimitiveData(1, 1, 1, 1, 1, 1, true) :: Nil)
+    val encoder = ProductEncoder[RepeatedStruct]
+
+    val converted = encoder.toRow(inputData)
+    val convertedStruct = converted.getArray(0).getStruct(0, 7)
+    assert(convertedStruct.getInt(0) == 1)
+    assert(convertedStruct.getLong(1) == 1.toLong)
+    assert(convertedStruct.getDouble(2) == 1.toDouble)
+    assert(convertedStruct.getFloat(3) == 1.toFloat)
+    assert(convertedStruct.getShort(4) == 1.toShort)
+    assert(convertedStruct.getByte(5) == 1.toByte)
+    assert(convertedStruct.getBoolean(6) == true)
+  }
+
+  test("convert nested seq") {
+    val convertedData = ProductEncoder[Tuple1[Seq[Seq[Int]]]].toRow(Tuple1(Seq(Seq(1))))
+    assert(convertedData.getArray(0).getArray(0).getInt(0) == 1)
+
+    val convertedData2 = ProductEncoder[Tuple1[Seq[Seq[Seq[Int]]]]].toRow(Tuple1(Seq(Seq(Seq(1)))))
+    assert(convertedData2.getArray(0).getArray(0).getArray(0).getInt(0) == 1)
+  }
+
+  test("convert nested array") {
+    val convertedData = ProductEncoder[Tuple1[Array[Array[Int]]]].toRow(Tuple1(Array(Array(1))))
+  }
+
+  test("convert complex") {
+    val inputData = ComplexData(
+      Seq(1, 2),
+      Array(1, 2),
+      1 :: 2 :: Nil,
+      Seq(new Integer(1), null, new Integer(2)),
+      Map(1 -> 2L),
+      Map(1 -> new java.lang.Long(2)),
+      PrimitiveData(1, 1, 1, 1, 1, 1, true),
+      Array(Array(1)))
+
+    val encoder = ProductEncoder[ComplexData]
+    val convertedData = encoder.toRow(inputData)
+
+    assert(!convertedData.isNullAt(0))
+    val seq = convertedData.getArray(0)
+    assert(seq.numElements() == 2)
+    assert(seq.getInt(0) == 1)
+    assert(seq.getInt(1) == 2)
+  }
+}

From 2816c89b6a304cb0b5214e14ebbc320158e88260 Mon Sep 17 00:00:00 2001
From: Josh Rosen
Date: Thu, 8 Oct 2015 14:53:21 -0700
Subject: [PATCH 004/139] [SPARK-10988] [SQL] Reduce duplication in Aggregate2's expression rewriting logic

In `aggregate/utils.scala`, there is a substantial amount of duplication in the expression-rewriting logic. As a prerequisite to supporting imperative aggregate functions in `TungstenAggregate`, this patch refactors this file so that the same expression-rewriting logic is used for both `SortAggregate` and `TungstenAggregate`.

In order to allow both operators to use the same rewriting logic, `TungstenAggregationIterator.generateResultProjection()` has been updated so that it first evaluates all declarative aggregate functions' `evaluateExpression`s and writes the results into a temporary buffer, and then uses this temporary buffer and the grouping expressions to evaluate the final resultExpressions. This matches the logic in SortAggregateIterator, where this two-pass approach is necessary in order to support imperative aggregates. If this change turns out to cause performance regressions, then we can look into re-implementing the single-pass evaluation in a cleaner way as part of a followup patch.

Since the rewriting logic is now shared across both operators, this patch also extracts that logic and places it in `SparkStrategies`. This makes the rewriting logic a bit easier to follow, I think.

Author: Josh Rosen

Closes #9015 from JoshRosen/SPARK-10988.
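
To make the two-pass result projection concrete, here is a minimal, illustrative Scala sketch. Everything in it (`TwoPassResultProjectionSketch`, the `Row` alias, the hand-written eval functions and projection) is invented for the example and only stands in for `UnsafeRow`, each `DeclarativeAggregate.evaluateExpression`, and `UnsafeProjection`; it is not the code in this patch.

```scala
object TwoPassResultProjectionSketch {
  type Row = Array[Any]

  // Pass 1: evaluate each declarative aggregate's final ("evaluate") expression against the
  // final aggregation buffer and collect the results into a temporary buffer.
  def evaluateDeclarativeAggregates(finalBuffer: Row, evalFunctions: Seq[Row => Any]): Row =
    evalFunctions.map(f => f(finalBuffer)).toArray

  // Pass 2: run the result projection over (grouping key ++ temporary buffer), the same input
  // shape the sort-based iterator feeds to its result projection.
  def project(groupingKey: Row, aggregateResults: Row, resultProjection: Row => Row): Row =
    resultProjection(groupingKey ++ aggregateResults)

  def main(args: Array[String]): Unit = {
    // e.g. SELECT key, sum(v), avg(v) FROM t GROUP BY key
    val groupingKey: Row = Array[Any]("key-1")
    val finalBuffer: Row = Array[Any](10L, 10L, 4L) // sum's slot, then (sum, count) for avg
    val evals: Seq[Row => Any] = Seq(
      buf => buf(0), // sum just returns its buffer slot
      buf => buf(1).asInstanceOf[Long].toDouble / buf(2).asInstanceOf[Long]) // avg = sum / count
    val tmp = evaluateDeclarativeAggregates(finalBuffer, evals)
    val result = project(groupingKey, tmp, row => Array[Any](row(0), row(1), row(2)))
    println(result.mkString(", ")) // key-1, 10, 2.5
  }
}
```

The price of this shape is one extra projection per output row, which is the possible performance regression mentioned above.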
--- .../spark/sql/execution/SparkStrategies.scala | 67 +++-- .../aggregate/TungstenAggregate.scala | 4 + .../TungstenAggregationIterator.scala | 22 +- .../spark/sql/execution/aggregate/utils.scala | 244 +++++------------- .../TungstenAggregationIteratorSuite.scala | 2 +- 5 files changed, 143 insertions(+), 196 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala index d1bbf2e20fcf4..79bd1a41808de 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala @@ -195,19 +195,22 @@ private[sql] abstract class SparkStrategies extends QueryPlanner[SparkPlan] { converted match { case None => Nil // Cannot convert to new aggregation code path. case Some(logical.Aggregate(groupingExpressions, resultExpressions, child)) => - // Extracts all distinct aggregate expressions from the resultExpressions. + // A single aggregate expression might appear multiple times in resultExpressions. + // In order to avoid evaluating an individual aggregate function multiple times, we'll + // build a set of the distinct aggregate expressions and build a function which can + // be used to re-write expressions so that they reference the single copy of the + // aggregate function which actually gets computed. val aggregateExpressions = resultExpressions.flatMap { expr => expr.collect { case agg: AggregateExpression2 => agg } - }.toSet.toSeq + }.distinct // For those distinct aggregate expressions, we create a map from the // aggregate function to the corresponding attribute of the function. - val aggregateFunctionMap = aggregateExpressions.map { agg => + val aggregateFunctionToAttribute = aggregateExpressions.map { agg => val aggregateFunction = agg.aggregateFunction - val attribtue = Alias(aggregateFunction, aggregateFunction.toString)().toAttribute - (aggregateFunction, agg.isDistinct) -> - (aggregateFunction -> attribtue) + val attribute = Alias(aggregateFunction, aggregateFunction.toString)().toAttribute + (aggregateFunction, agg.isDistinct) -> attribute }.toMap val (functionsWithDistinct, functionsWithoutDistinct) = @@ -220,6 +223,40 @@ private[sql] abstract class SparkStrategies extends QueryPlanner[SparkPlan] { "code path.") } + val namedGroupingExpressions = groupingExpressions.map { + case ne: NamedExpression => ne -> ne + // If the expression is not a NamedExpressions, we add an alias. + // So, when we generate the result of the operator, the Aggregate Operator + // can directly get the Seq of attributes representing the grouping expressions. + case other => + val withAlias = Alias(other, other.toString)() + other -> withAlias + } + val groupExpressionMap = namedGroupingExpressions.toMap + + // The original `resultExpressions` are a set of expressions which may reference + // aggregate expressions, grouping column values, and constants. When aggregate operator + // emits output rows, we will use `resultExpressions` to generate an output projection + // which takes the grouping columns and final aggregate result buffer as input. 
+ // Thus, we must re-write the result expressions so that their attributes match up with + // the attributes of the final result projection's input row: + val rewrittenResultExpressions = resultExpressions.map { expr => + expr.transformDown { + case AggregateExpression2(aggregateFunction, _, isDistinct) => + // The final aggregation buffer's attributes will be `finalAggregationAttributes`, + // so replace each aggregate expression by its corresponding attribute in the set: + aggregateFunctionToAttribute(aggregateFunction, isDistinct) + case expression => + // Since we're using `namedGroupingAttributes` to extract the grouping key + // columns, we need to replace grouping key expressions with their corresponding + // attributes. We do not rely on the equality check at here since attributes may + // differ cosmetically. Instead, we use semanticEquals. + groupExpressionMap.collectFirst { + case (expr, ne) if expr semanticEquals expression => ne.toAttribute + }.getOrElse(expression) + }.asInstanceOf[NamedExpression] + } + val aggregateOperator = if (aggregateExpressions.map(_.aggregateFunction).exists(!_.supportsPartial)) { if (functionsWithDistinct.nonEmpty) { @@ -227,26 +264,26 @@ private[sql] abstract class SparkStrategies extends QueryPlanner[SparkPlan] { "aggregate functions which don't support partial aggregation.") } else { aggregate.Utils.planAggregateWithoutPartial( - groupingExpressions, + namedGroupingExpressions.map(_._2), aggregateExpressions, - aggregateFunctionMap, - resultExpressions, + aggregateFunctionToAttribute, + rewrittenResultExpressions, planLater(child)) } } else if (functionsWithDistinct.isEmpty) { aggregate.Utils.planAggregateWithoutDistinct( - groupingExpressions, + namedGroupingExpressions.map(_._2), aggregateExpressions, - aggregateFunctionMap, - resultExpressions, + aggregateFunctionToAttribute, + rewrittenResultExpressions, planLater(child)) } else { aggregate.Utils.planAggregateWithOneDistinct( - groupingExpressions, + namedGroupingExpressions.map(_._2), functionsWithDistinct, functionsWithoutDistinct, - aggregateFunctionMap, - resultExpressions, + aggregateFunctionToAttribute, + rewrittenResultExpressions, planLater(child)) } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/TungstenAggregate.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/TungstenAggregate.scala index 3cd22af30592c..7b3d072b2e067 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/TungstenAggregate.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/TungstenAggregate.scala @@ -31,7 +31,9 @@ case class TungstenAggregate( requiredChildDistributionExpressions: Option[Seq[Expression]], groupingExpressions: Seq[NamedExpression], nonCompleteAggregateExpressions: Seq[AggregateExpression2], + nonCompleteAggregateAttributes: Seq[Attribute], completeAggregateExpressions: Seq[AggregateExpression2], + completeAggregateAttributes: Seq[Attribute], resultExpressions: Seq[NamedExpression], child: SparkPlan) extends UnaryNode { @@ -77,7 +79,9 @@ case class TungstenAggregate( new TungstenAggregationIterator( groupingExpressions, nonCompleteAggregateExpressions, + nonCompleteAggregateAttributes, completeAggregateExpressions, + completeAggregateAttributes, resultExpressions, newMutableProjection, child.output, diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/TungstenAggregationIterator.scala 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/TungstenAggregationIterator.scala index a6f4c1d92f6dc..4bb95c9eb7f3e 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/TungstenAggregationIterator.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/TungstenAggregationIterator.scala @@ -60,8 +60,12 @@ import org.apache.spark.sql.types.StructType * @param nonCompleteAggregateExpressions * [[AggregateExpression2]] containing [[AggregateFunction2]]s with mode [[Partial]], * [[PartialMerge]], or [[Final]]. + * @param nonCompleteAggregateAttributes the attributes of the nonCompleteAggregateExpressions' + * outputs when they are stored in the final aggregation buffer. * @param completeAggregateExpressions * [[AggregateExpression2]] containing [[AggregateFunction2]]s with mode [[Complete]]. + * @param completeAggregateAttributes the attributes of completeAggregateExpressions' outputs + * when they are stored in the final aggregation buffer. * @param resultExpressions * expressions for generating output rows. * @param newMutableProjection @@ -72,7 +76,9 @@ import org.apache.spark.sql.types.StructType class TungstenAggregationIterator( groupingExpressions: Seq[NamedExpression], nonCompleteAggregateExpressions: Seq[AggregateExpression2], + nonCompleteAggregateAttributes: Seq[Attribute], completeAggregateExpressions: Seq[AggregateExpression2], + completeAggregateAttributes: Seq[Attribute], resultExpressions: Seq[NamedExpression], newMutableProjection: (Seq[Expression], Seq[Attribute]) => (() => MutableProjection), originalInputAttributes: Seq[Attribute], @@ -280,17 +286,25 @@ class TungstenAggregationIterator( // resultExpressions. case (Some(Final), None) | (Some(Final) | None, Some(Complete)) => val joinedRow = new JoinedRow() + val evalExpressions = allAggregateFunctions.map { + case ae: DeclarativeAggregate => ae.evaluateExpression + // case agg: AggregateFunction2 => Literal.create(null, agg.dataType) + } + val expressionAggEvalProjection = UnsafeProjection.create(evalExpressions, bufferAttributes) + // These are the attributes of the row produced by `expressionAggEvalProjection` + val aggregateResultSchema = nonCompleteAggregateAttributes ++ completeAggregateAttributes val resultProjection = - UnsafeProjection.create(resultExpressions, groupingAttributes ++ bufferAttributes) + UnsafeProjection.create(resultExpressions, groupingAttributes ++ aggregateResultSchema) (currentGroupingKey: UnsafeRow, currentBuffer: UnsafeRow) => { - resultProjection(joinedRow(currentGroupingKey, currentBuffer)) + // Generate results for all expression-based aggregate functions. + val aggregateResult = expressionAggEvalProjection.apply(currentBuffer) + resultProjection(joinedRow(currentGroupingKey, aggregateResult)) } // Grouping-only: a output row is generated from values of grouping expressions. 
case (None, None) => - val resultProjection = - UnsafeProjection.create(resultExpressions, groupingAttributes) + val resultProjection = UnsafeProjection.create(resultExpressions, groupingAttributes) (currentGroupingKey: UnsafeRow, currentBuffer: UnsafeRow) => { resultProjection(currentGroupingKey) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/utils.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/utils.scala index e1c2d9475a10f..cf6e7ed0d337f 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/utils.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/utils.scala @@ -17,8 +17,6 @@ package org.apache.spark.sql.execution.aggregate -import scala.collection.mutable - import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.expressions.aggregate._ import org.apache.spark.sql.execution.{UnsafeFixedWidthAggregationMap, SparkPlan} @@ -38,60 +36,35 @@ object Utils { } def planAggregateWithoutPartial( - groupingExpressions: Seq[Expression], + groupingExpressions: Seq[NamedExpression], aggregateExpressions: Seq[AggregateExpression2], - aggregateFunctionMap: Map[(AggregateFunction2, Boolean), (AggregateFunction2, Attribute)], + aggregateFunctionToAttribute: Map[(AggregateFunction2, Boolean), Attribute], resultExpressions: Seq[NamedExpression], child: SparkPlan): Seq[SparkPlan] = { - val namedGroupingExpressions = groupingExpressions.map { - case ne: NamedExpression => ne -> ne - // If the expression is not a NamedExpressions, we add an alias. - // So, when we generate the result of the operator, the Aggregate Operator - // can directly get the Seq of attributes representing the grouping expressions. - case other => - val withAlias = Alias(other, other.toString)() - other -> withAlias - } - val groupExpressionMap = namedGroupingExpressions.toMap - val namedGroupingAttributes = namedGroupingExpressions.map(_._2.toAttribute) - + val groupingAttributes = groupingExpressions.map(_.toAttribute) val completeAggregateExpressions = aggregateExpressions.map(_.copy(mode = Complete)) - val completeAggregateAttributes = - completeAggregateExpressions.map { - expr => aggregateFunctionMap(expr.aggregateFunction, expr.isDistinct)._2 - } - - val rewrittenResultExpressions = resultExpressions.map { expr => - expr.transformDown { - case agg: AggregateExpression2 => - aggregateFunctionMap(agg.aggregateFunction, agg.isDistinct)._2 - case expression => - // We do not rely on the equality check at here since attributes may - // different cosmetically. Instead, we use semanticEquals. 
- groupExpressionMap.collectFirst { - case (expr, ne) if expr semanticEquals expression => ne.toAttribute - }.getOrElse(expression) - }.asInstanceOf[NamedExpression] + val completeAggregateAttributes = completeAggregateExpressions.map { + expr => aggregateFunctionToAttribute(expr.aggregateFunction, expr.isDistinct) } SortBasedAggregate( - requiredChildDistributionExpressions = Some(namedGroupingAttributes), - groupingExpressions = namedGroupingExpressions.map(_._2), + requiredChildDistributionExpressions = Some(groupingAttributes), + groupingExpressions = groupingAttributes, nonCompleteAggregateExpressions = Nil, nonCompleteAggregateAttributes = Nil, completeAggregateExpressions = completeAggregateExpressions, completeAggregateAttributes = completeAggregateAttributes, initialInputBufferOffset = 0, - resultExpressions = rewrittenResultExpressions, + resultExpressions = resultExpressions, child = child ) :: Nil } def planAggregateWithoutDistinct( - groupingExpressions: Seq[Expression], + groupingExpressions: Seq[NamedExpression], aggregateExpressions: Seq[AggregateExpression2], - aggregateFunctionMap: Map[(AggregateFunction2, Boolean), (AggregateFunction2, Attribute)], + aggregateFunctionToAttribute: Map[(AggregateFunction2, Boolean), Attribute], resultExpressions: Seq[NamedExpression], child: SparkPlan): Seq[SparkPlan] = { // Check if we can use TungstenAggregate. @@ -104,36 +77,29 @@ object Utils { // 1. Create an Aggregate Operator for partial aggregations. - val namedGroupingExpressions = groupingExpressions.map { - case ne: NamedExpression => ne -> ne - // If the expression is not a NamedExpressions, we add an alias. - // So, when we generate the result of the operator, the Aggregate Operator - // can directly get the Seq of attributes representing the grouping expressions. - case other => - val withAlias = Alias(other, other.toString)() - other -> withAlias - } - val groupExpressionMap = namedGroupingExpressions.toMap - val namedGroupingAttributes = namedGroupingExpressions.map(_._2.toAttribute) + + val groupingAttributes = groupingExpressions.map(_.toAttribute) val partialAggregateExpressions = aggregateExpressions.map(_.copy(mode = Partial)) val partialAggregateAttributes = partialAggregateExpressions.flatMap(_.aggregateFunction.aggBufferAttributes) val partialResultExpressions = - namedGroupingAttributes ++ + groupingAttributes ++ partialAggregateExpressions.flatMap(_.aggregateFunction.inputAggBufferAttributes) val partialAggregate = if (usesTungstenAggregate) { TungstenAggregate( requiredChildDistributionExpressions = None: Option[Seq[Expression]], - groupingExpressions = namedGroupingExpressions.map(_._2), + groupingExpressions = groupingExpressions, nonCompleteAggregateExpressions = partialAggregateExpressions, + nonCompleteAggregateAttributes = partialAggregateAttributes, completeAggregateExpressions = Nil, + completeAggregateAttributes = Nil, resultExpressions = partialResultExpressions, child = child) } else { SortBasedAggregate( requiredChildDistributionExpressions = None: Option[Seq[Expression]], - groupingExpressions = namedGroupingExpressions.map(_._2), + groupingExpressions = groupingExpressions, nonCompleteAggregateExpressions = partialAggregateExpressions, nonCompleteAggregateAttributes = partialAggregateAttributes, completeAggregateExpressions = Nil, @@ -145,58 +111,32 @@ object Utils { // 2. Create an Aggregate Operator for final aggregations. 
val finalAggregateExpressions = aggregateExpressions.map(_.copy(mode = Final)) - val finalAggregateAttributes = - finalAggregateExpressions.map { - expr => aggregateFunctionMap(expr.aggregateFunction, expr.isDistinct)._2 - } + // The attributes of the final aggregation buffer, which is presented as input to the result + // projection: + val finalAggregateAttributes = finalAggregateExpressions.map { + expr => aggregateFunctionToAttribute(expr.aggregateFunction, expr.isDistinct) + } val finalAggregate = if (usesTungstenAggregate) { - val rewrittenResultExpressions = resultExpressions.map { expr => - expr.transformDown { - case agg: AggregateExpression2 => - // aggregateFunctionMap contains unique aggregate functions. - val aggregateFunction = - aggregateFunctionMap(agg.aggregateFunction, agg.isDistinct)._1 - aggregateFunction.asInstanceOf[DeclarativeAggregate].evaluateExpression - case expression => - // We do not rely on the equality check at here since attributes may - // different cosmetically. Instead, we use semanticEquals. - groupExpressionMap.collectFirst { - case (expr, ne) if expr semanticEquals expression => ne.toAttribute - }.getOrElse(expression) - }.asInstanceOf[NamedExpression] - } - TungstenAggregate( - requiredChildDistributionExpressions = Some(namedGroupingAttributes), - groupingExpressions = namedGroupingAttributes, + requiredChildDistributionExpressions = Some(groupingAttributes), + groupingExpressions = groupingAttributes, nonCompleteAggregateExpressions = finalAggregateExpressions, + nonCompleteAggregateAttributes = finalAggregateAttributes, completeAggregateExpressions = Nil, - resultExpressions = rewrittenResultExpressions, + completeAggregateAttributes = Nil, + resultExpressions = resultExpressions, child = partialAggregate) } else { - val rewrittenResultExpressions = resultExpressions.map { expr => - expr.transformDown { - case agg: AggregateExpression2 => - aggregateFunctionMap(agg.aggregateFunction, agg.isDistinct)._2 - case expression => - // We do not rely on the equality check at here since attributes may - // different cosmetically. Instead, we use semanticEquals. 
- groupExpressionMap.collectFirst { - case (expr, ne) if expr semanticEquals expression => ne.toAttribute - }.getOrElse(expression) - }.asInstanceOf[NamedExpression] - } - SortBasedAggregate( - requiredChildDistributionExpressions = Some(namedGroupingAttributes), - groupingExpressions = namedGroupingAttributes, + requiredChildDistributionExpressions = Some(groupingAttributes), + groupingExpressions = groupingAttributes, nonCompleteAggregateExpressions = finalAggregateExpressions, nonCompleteAggregateAttributes = finalAggregateAttributes, completeAggregateExpressions = Nil, completeAggregateAttributes = Nil, - initialInputBufferOffset = namedGroupingAttributes.length, - resultExpressions = rewrittenResultExpressions, + initialInputBufferOffset = groupingExpressions.length, + resultExpressions = resultExpressions, child = partialAggregate) } @@ -204,10 +144,10 @@ object Utils { } def planAggregateWithOneDistinct( - groupingExpressions: Seq[Expression], + groupingExpressions: Seq[NamedExpression], functionsWithDistinct: Seq[AggregateExpression2], functionsWithoutDistinct: Seq[AggregateExpression2], - aggregateFunctionMap: Map[(AggregateFunction2, Boolean), (AggregateFunction2, Attribute)], + aggregateFunctionToAttribute: Map[(AggregateFunction2, Boolean), Attribute], resultExpressions: Seq[NamedExpression], child: SparkPlan): Seq[SparkPlan] = { @@ -221,20 +161,7 @@ object Utils { aggregateExpressions.flatMap(_.aggregateFunction.aggBufferAttributes)) // 1. Create an Aggregate Operator for partial aggregations. - // The grouping expressions are original groupingExpressions and - // distinct columns. For example, for avg(distinct value) ... group by key - // the grouping expressions of this Aggregate Operator will be [key, value]. - val namedGroupingExpressions = groupingExpressions.map { - case ne: NamedExpression => ne -> ne - // If the expression is not a NamedExpressions, we add an alias. - // So, when we generate the result of the operator, the Aggregate Operator - // can directly get the Seq of attributes representing the grouping expressions. - case other => - val withAlias = Alias(other, other.toString)() - other -> withAlias - } - val groupExpressionMap = namedGroupingExpressions.toMap - val namedGroupingAttributes = namedGroupingExpressions.map(_._2.toAttribute) + val groupingAttributes = groupingExpressions.map(_.toAttribute) // It is safe to call head at here since functionsWithDistinct has at least one // AggregateExpression2. @@ -253,22 +180,27 @@ object Utils { val partialAggregateAttributes = partialAggregateExpressions.flatMap(_.aggregateFunction.aggBufferAttributes) val partialAggregateGroupingExpressions = - (namedGroupingExpressions ++ namedDistinctColumnExpressions).map(_._2) + groupingExpressions ++ namedDistinctColumnExpressions.map(_._2) val partialAggregateResult = - namedGroupingAttributes ++ + groupingAttributes ++ distinctColumnAttributes ++ partialAggregateExpressions.flatMap(_.aggregateFunction.inputAggBufferAttributes) val partialAggregate = if (usesTungstenAggregate) { TungstenAggregate( - requiredChildDistributionExpressions = None: Option[Seq[Expression]], + requiredChildDistributionExpressions = None, + // The grouping expressions are original groupingExpressions and + // distinct columns. For example, for avg(distinct value) ... group by key + // the grouping expressions of this Aggregate Operator will be [key, value]. 
groupingExpressions = partialAggregateGroupingExpressions, nonCompleteAggregateExpressions = partialAggregateExpressions, + nonCompleteAggregateAttributes = partialAggregateAttributes, completeAggregateExpressions = Nil, + completeAggregateAttributes = Nil, resultExpressions = partialAggregateResult, child = child) } else { SortBasedAggregate( - requiredChildDistributionExpressions = None: Option[Seq[Expression]], + requiredChildDistributionExpressions = None, groupingExpressions = partialAggregateGroupingExpressions, nonCompleteAggregateExpressions = partialAggregateExpressions, nonCompleteAggregateAttributes = partialAggregateAttributes, @@ -284,41 +216,40 @@ object Utils { val partialMergeAggregateAttributes = partialMergeAggregateExpressions.flatMap(_.aggregateFunction.aggBufferAttributes) val partialMergeAggregateResult = - namedGroupingAttributes ++ + groupingAttributes ++ distinctColumnAttributes ++ partialMergeAggregateExpressions.flatMap(_.aggregateFunction.inputAggBufferAttributes) val partialMergeAggregate = if (usesTungstenAggregate) { TungstenAggregate( - requiredChildDistributionExpressions = Some(namedGroupingAttributes), - groupingExpressions = namedGroupingAttributes ++ distinctColumnAttributes, + requiredChildDistributionExpressions = Some(groupingAttributes), + groupingExpressions = groupingAttributes ++ distinctColumnAttributes, nonCompleteAggregateExpressions = partialMergeAggregateExpressions, + nonCompleteAggregateAttributes = partialMergeAggregateAttributes, completeAggregateExpressions = Nil, + completeAggregateAttributes = Nil, resultExpressions = partialMergeAggregateResult, child = partialAggregate) } else { SortBasedAggregate( - requiredChildDistributionExpressions = Some(namedGroupingAttributes), - groupingExpressions = namedGroupingAttributes ++ distinctColumnAttributes, + requiredChildDistributionExpressions = Some(groupingAttributes), + groupingExpressions = groupingAttributes ++ distinctColumnAttributes, nonCompleteAggregateExpressions = partialMergeAggregateExpressions, nonCompleteAggregateAttributes = partialMergeAggregateAttributes, completeAggregateExpressions = Nil, completeAggregateAttributes = Nil, - initialInputBufferOffset = (namedGroupingAttributes ++ distinctColumnAttributes).length, + initialInputBufferOffset = (groupingAttributes ++ distinctColumnAttributes).length, resultExpressions = partialMergeAggregateResult, child = partialAggregate) } // 3. Create an Aggregate Operator for partial merge aggregations. val finalAggregateExpressions = functionsWithoutDistinct.map(_.copy(mode = Final)) - val finalAggregateAttributes = - finalAggregateExpressions.map { - expr => aggregateFunctionMap(expr.aggregateFunction, expr.isDistinct)._2 - } - // Create a map to store those rewritten aggregate functions. We always need to use - // both function and its corresponding isDistinct flag as the key because function itself - // does not knows if it is has distinct keyword or now. - val rewrittenAggregateFunctions = - mutable.Map.empty[(AggregateFunction2, Boolean), AggregateFunction2] + // The attributes of the final aggregation buffer, which is presented as input to the result + // projection: + val finalAggregateAttributes = finalAggregateExpressions.map { + expr => aggregateFunctionToAttribute(expr.aggregateFunction, expr.isDistinct) + } + val (completeAggregateExpressions, completeAggregateAttributes) = functionsWithDistinct.map { // Children of an AggregateFunction with DISTINCT keyword has already // been evaluated. 
At here, we need to replace original children @@ -328,9 +259,6 @@ object Utils { case expr if distinctColumnExpressionMap.contains(expr) => distinctColumnExpressionMap(expr).toAttribute }.asInstanceOf[AggregateFunction2] - // Because we have rewritten the aggregate function, we use rewrittenAggregateFunctions - // to track the old version and the new version of this function. - rewrittenAggregateFunctions += (aggregateFunction, true) -> rewrittenAggregateFunction // We rewrite the aggregate function to a non-distinct aggregation because // its input will have distinct arguments. // We just keep the isDistinct setting to true, so when users look at the query plan, @@ -338,66 +266,30 @@ object Utils { val rewrittenAggregateExpression = AggregateExpression2(rewrittenAggregateFunction, Complete, true) - val aggregateFunctionAttribute = - aggregateFunctionMap(agg.aggregateFunction, true)._2 - (rewrittenAggregateExpression -> aggregateFunctionAttribute) + val aggregateFunctionAttribute = aggregateFunctionToAttribute(agg.aggregateFunction, true) + (rewrittenAggregateExpression, aggregateFunctionAttribute) }.unzip val finalAndCompleteAggregate = if (usesTungstenAggregate) { - val rewrittenResultExpressions = resultExpressions.map { expr => - expr.transform { - case agg: AggregateExpression2 => - val function = agg.aggregateFunction - val isDistinct = agg.isDistinct - val aggregateFunction = - if (rewrittenAggregateFunctions.contains(function, isDistinct)) { - // If this function has been rewritten, we get the rewritten version from - // rewrittenAggregateFunctions. - rewrittenAggregateFunctions(function, isDistinct) - } else { - // Oterwise, we get it from aggregateFunctionMap, which contains unique - // aggregate functions that have not been rewritten. - aggregateFunctionMap(function, isDistinct)._1 - } - aggregateFunction.asInstanceOf[DeclarativeAggregate].evaluateExpression - case expression => - // We do not rely on the equality check at here since attributes may - // different cosmetically. Instead, we use semanticEquals. - groupExpressionMap.collectFirst { - case (expr, ne) if expr semanticEquals expression => ne.toAttribute - }.getOrElse(expression) - }.asInstanceOf[NamedExpression] - } - TungstenAggregate( - requiredChildDistributionExpressions = Some(namedGroupingAttributes), - groupingExpressions = namedGroupingAttributes, + requiredChildDistributionExpressions = Some(groupingAttributes), + groupingExpressions = groupingAttributes, nonCompleteAggregateExpressions = finalAggregateExpressions, + nonCompleteAggregateAttributes = finalAggregateAttributes, completeAggregateExpressions = completeAggregateExpressions, - resultExpressions = rewrittenResultExpressions, + completeAggregateAttributes = completeAggregateAttributes, + resultExpressions = resultExpressions, child = partialMergeAggregate) } else { - val rewrittenResultExpressions = resultExpressions.map { expr => - expr.transform { - case agg: AggregateExpression2 => - aggregateFunctionMap(agg.aggregateFunction, agg.isDistinct)._2 - case expression => - // We do not rely on the equality check at here since attributes may - // different cosmetically. Instead, we use semanticEquals. 
- groupExpressionMap.collectFirst { - case (expr, ne) if expr semanticEquals expression => ne.toAttribute - }.getOrElse(expression) - }.asInstanceOf[NamedExpression] - } SortBasedAggregate( - requiredChildDistributionExpressions = Some(namedGroupingAttributes), - groupingExpressions = namedGroupingAttributes, + requiredChildDistributionExpressions = Some(groupingAttributes), + groupingExpressions = groupingAttributes, nonCompleteAggregateExpressions = finalAggregateExpressions, nonCompleteAggregateAttributes = finalAggregateAttributes, completeAggregateExpressions = completeAggregateExpressions, completeAggregateAttributes = completeAggregateAttributes, - initialInputBufferOffset = (namedGroupingAttributes ++ distinctColumnAttributes).length, - resultExpressions = rewrittenResultExpressions, + initialInputBufferOffset = (groupingAttributes ++ distinctColumnAttributes).length, + resultExpressions = resultExpressions, child = partialMergeAggregate) } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/aggregate/TungstenAggregationIteratorSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/aggregate/TungstenAggregationIteratorSuite.scala index 7ca677a6c72ad..ed974b3a53d41 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/aggregate/TungstenAggregationIteratorSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/aggregate/TungstenAggregationIteratorSuite.scala @@ -38,7 +38,7 @@ class TungstenAggregationIteratorSuite extends SparkFunSuite with SharedSQLConte () => new InterpretedMutableProjection(expr, schema) } val dummyAccum = SQLMetrics.createLongMetric(sparkContext, "dummy") - iter = new TungstenAggregationIterator(Seq.empty, Seq.empty, Seq.empty, + iter = new TungstenAggregationIterator(Seq.empty, Seq.empty, Seq.empty, Seq.empty, Seq.empty, Seq.empty, newMutableProjection, Seq.empty, None, dummyAccum, dummyAccum) val numPages = iter.getHashMap.getNumDataPages assert(numPages === 1) From 02149ff08eed3745086589a047adbce9a580389f Mon Sep 17 00:00:00 2001 From: Cheng Lian Date: Thu, 8 Oct 2015 16:18:35 -0700 Subject: [PATCH 005/139] [SPARK-8848] [SQL] Refactors Parquet write path to follow parquet-format This PR refactors Parquet write path to follow parquet-format spec. It's a successor of PR #7679, but with less non-essential changes. Major changes include: 1. Replaces `RowWriteSupport` and `MutableRowWriteSupport` with `CatalystWriteSupport` - Writes Parquet data using standard layout defined in parquet-format Specifically, we are now writing ... - ... arrays and maps in standard 3-level structure with proper annotations and field names - ... decimals as `INT32` and `INT64` whenever possible, and taking `FIXED_LEN_BYTE_ARRAY` as the final fallback - Supports legacy mode which is compatible with Spark 1.4 and prior versions The legacy mode is by default off, and can be turned on by flipping SQL option `spark.sql.parquet.writeLegacyFormat` to `true`. - Eliminates per value data type dispatching costs via prebuilt composed writer functions 1. Cleans up the last pieces of old Parquet support code As pointed out by rxin previously, we probably want to rename all those `Catalyst*` Parquet classes to `Parquet*` for clarity. But I'd like to do this in a follow-up PR to minimize code review noises in this one. Author: Cheng Lian Closes #8988 from liancheng/spark-8848/standard-parquet-write-path. 
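
The decimal handling described above (`INT32` or `INT64` whenever the precision allows it, `FIXED_LEN_BYTE_ARRAY` otherwise, and the legacy Spark 1.4 layout behind `spark.sql.parquet.writeLegacyFormat`) can be summarized with a small illustrative sketch. This is plain Scala with invented names (`DecimalPhysicalTypeSketch`, `PhysicalType`, `physicalTypeFor`), not the actual `CatalystWriteSupport`/`CatalystSchemaConverter` code; the 9 and 18 digit cut-offs are simply what fits in a signed 4-byte and 8-byte integer, and `minBytesForPrecision` only mirrors the helper of the same name in the schema converter.

```scala
object DecimalPhysicalTypeSketch {
  sealed trait PhysicalType
  case object Int32 extends PhysicalType                       // precision <= 9 digits
  case object Int64 extends PhysicalType                       // precision <= 18 digits
  case class FixedLenByteArray(numBytes: Int) extends PhysicalType

  // Smallest number of bytes whose signed two's-complement range covers `precision` digits.
  private def minBytesForPrecision(precision: Int): Int =
    Iterator.from(1).find(n => BigInt(2).pow(8 * n - 1) > BigInt(10).pow(precision)).get

  def physicalTypeFor(precision: Int, writeLegacyFormat: Boolean): PhysicalType =
    if (writeLegacyFormat) FixedLenByteArray(minBytesForPrecision(precision)) // Spark 1.4 layout
    else if (precision <= 9) Int32
    else if (precision <= 18) Int64
    else FixedLenByteArray(minBytesForPrecision(precision))

  def main(args: Array[String]): Unit = {
    println(physicalTypeFor(precision = 7,  writeLegacyFormat = false)) // Int32
    println(physicalTypeFor(precision = 15, writeLegacyFormat = false)) // Int64
    println(physicalTypeFor(precision = 25, writeLegacyFormat = false)) // FixedLenByteArray(11)
    println(physicalTypeFor(precision = 7,  writeLegacyFormat = true))  // FixedLenByteArray(4)
  }
}
```

In this Spark version the legacy layout would be enabled with something along the lines of `sqlContext.setConf("spark.sql.parquet.writeLegacyFormat", "true")` before writing, for files that must remain readable by Spark 1.4 and earlier.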
--- .../org/apache/spark/sql/types/Decimal.scala | 4 +- .../scala/org/apache/spark/sql/SQLConf.scala | 5 +- .../parquet/CatalystReadSupport.scala | 35 +- .../parquet/CatalystRowConverter.scala | 59 +-- .../parquet/CatalystSchemaConverter.scala | 46 +- .../parquet/CatalystWriteSupport.scala | 436 ++++++++++++++++++ .../DirectParquetOutputCommitter.scala | 2 +- .../parquet/ParquetConverter.scala | 39 -- .../datasources/parquet/ParquetFilters.scala | 36 -- .../datasources/parquet/ParquetRelation.scala | 42 +- .../parquet/ParquetTableSupport.scala | 321 ------------- .../parquet/ParquetTypesConverter.scala | 160 ------- .../spark/sql/UserDefinedTypeSuite.scala | 32 +- .../datasources/parquet/ParquetIOSuite.scala | 63 +-- .../parquet/ParquetQuerySuite.scala | 46 +- .../parquet/ParquetSchemaSuite.scala | 4 +- .../datasources/parquet/ParquetTest.scala | 44 +- .../sql/hive/HiveMetastoreCatalogSuite.scala | 28 +- 18 files changed, 709 insertions(+), 693 deletions(-) create mode 100644 sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/CatalystWriteSupport.scala delete mode 100644 sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetConverter.scala delete mode 100644 sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetTableSupport.scala delete mode 100644 sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetTypesConverter.scala diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/Decimal.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/Decimal.scala index 909b8e31f2458..c11dab35cdf6f 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/Decimal.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/Decimal.scala @@ -108,7 +108,9 @@ final class Decimal extends Ordered[Decimal] with Serializable { */ def set(decimal: BigDecimal, precision: Int, scale: Int): Decimal = { this.decimalVal = decimal.setScale(scale, ROUNDING_MODE) - require(decimalVal.precision <= precision, "Overflowed precision") + require( + decimalVal.precision <= precision, + s"Decimal precision ${decimalVal.precision} exceeds max precision $precision") this.longVal = 0L this._precision = precision this._scale = scale diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala b/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala index 8f0f8910b36ab..47397c4be3cb6 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala @@ -292,10 +292,9 @@ private[spark] object SQLConf { val PARQUET_WRITE_LEGACY_FORMAT = booleanConf( key = "spark.sql.parquet.writeLegacyFormat", - defaultValue = Some(true), + defaultValue = Some(false), doc = "Whether to follow Parquet's format specification when converting Parquet schema to " + - "Spark SQL schema and vice versa.", - isPublic = false) + "Spark SQL schema and vice versa.") val PARQUET_OUTPUT_COMMITTER_CLASS = stringConf( key = "spark.sql.parquet.output.committer.class", diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/CatalystReadSupport.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/CatalystReadSupport.scala index 5325698034095..a958373eb769d 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/CatalystReadSupport.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/CatalystReadSupport.scala @@ 
-95,7 +95,9 @@ private[parquet] class CatalystReadSupport extends ReadSupport[InternalRow] with """.stripMargin } - new CatalystRecordMaterializer(parquetRequestedSchema, catalystRequestedSchema) + new CatalystRecordMaterializer( + parquetRequestedSchema, + CatalystReadSupport.expandUDT(catalystRequestedSchema)) } } @@ -110,7 +112,10 @@ private[parquet] object CatalystReadSupport { */ def clipParquetSchema(parquetSchema: MessageType, catalystSchema: StructType): MessageType = { val clippedParquetFields = clipParquetGroupFields(parquetSchema.asGroupType(), catalystSchema) - Types.buildMessage().addFields(clippedParquetFields: _*).named("root") + Types + .buildMessage() + .addFields(clippedParquetFields: _*) + .named(CatalystSchemaConverter.SPARK_PARQUET_SCHEMA_NAME) } private def clipParquetType(parquetType: Type, catalystType: DataType): Type = { @@ -271,4 +276,30 @@ private[parquet] object CatalystReadSupport { .getOrElse(toParquet.convertField(f)) } } + + def expandUDT(schema: StructType): StructType = { + def expand(dataType: DataType): DataType = { + dataType match { + case t: ArrayType => + t.copy(elementType = expand(t.elementType)) + + case t: MapType => + t.copy( + keyType = expand(t.keyType), + valueType = expand(t.valueType)) + + case t: StructType => + val expandedFields = t.fields.map(f => f.copy(dataType = expand(f.dataType))) + t.copy(fields = expandedFields) + + case t: UserDefinedType[_] => + t.sqlType + + case t => + t + } + } + + expand(schema).asInstanceOf[StructType] + } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/CatalystRowConverter.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/CatalystRowConverter.scala index 050d3610a6413..247d35363b862 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/CatalystRowConverter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/CatalystRowConverter.scala @@ -27,7 +27,6 @@ import org.apache.parquet.column.Dictionary import org.apache.parquet.io.api.{Binary, Converter, GroupConverter, PrimitiveConverter} import org.apache.parquet.schema.OriginalType.{INT_32, LIST, UTF8} import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.DOUBLE -import org.apache.parquet.schema.Type.Repetition import org.apache.parquet.schema.{GroupType, MessageType, PrimitiveType, Type} import org.apache.spark.Logging @@ -114,7 +113,8 @@ private[parquet] class CatalystPrimitiveConverter(val updater: ParentContainerUp * any "parent" container. * * @param parquetType Parquet schema of Parquet records - * @param catalystType Spark SQL schema that corresponds to the Parquet record type + * @param catalystType Spark SQL schema that corresponds to the Parquet record type. User-defined + * types should have been expanded. 
* @param updater An updater which propagates converted field values to the parent container */ private[parquet] class CatalystRowConverter( @@ -133,6 +133,12 @@ private[parquet] class CatalystRowConverter( |${catalystType.prettyJson} """.stripMargin) + assert( + !catalystType.existsRecursively(_.isInstanceOf[UserDefinedType[_]]), + s"""User-defined types in Catalyst schema should have already been expanded: + |${catalystType.prettyJson} + """.stripMargin) + logDebug( s"""Building row converter for the following schema: | @@ -268,13 +274,6 @@ private[parquet] class CatalystRowConverter( override def set(value: Any): Unit = updater.set(value.asInstanceOf[InternalRow].copy()) }) - case t: UserDefinedType[_] => - val catalystTypeForUDT = t.sqlType - val nullable = parquetType.isRepetition(Repetition.OPTIONAL) - val field = StructField("udt", catalystTypeForUDT, nullable) - val parquetTypeForUDT = new CatalystSchemaConverter().convertField(field) - newConverter(parquetTypeForUDT, catalystTypeForUDT, updater) - case _ => throw new RuntimeException( s"Unable to create Parquet converter for data type ${catalystType.json}") @@ -340,30 +339,36 @@ private[parquet] class CatalystRowConverter( val scale = decimalType.scale if (precision <= CatalystSchemaConverter.MAX_PRECISION_FOR_INT64) { - // Constructs a `Decimal` with an unscaled `Long` value if possible. The underlying - // `ByteBuffer` implementation is guaranteed to be `HeapByteBuffer`, so here we are using - // `Binary.toByteBuffer.array()` to steal the underlying byte array without copying it. - val buffer = value.toByteBuffer - val bytes = buffer.array() - val start = buffer.position() - val end = buffer.limit() - - var unscaled = 0L - var i = start - - while (i < end) { - unscaled = (unscaled << 8) | (bytes(i) & 0xff) - i += 1 - } - - val bits = 8 * (end - start) - unscaled = (unscaled << (64 - bits)) >> (64 - bits) + // Constructs a `Decimal` with an unscaled `Long` value if possible. + val unscaled = binaryToUnscaledLong(value) Decimal(unscaled, precision, scale) } else { // Otherwise, resorts to an unscaled `BigInteger` instead. Decimal(new BigDecimal(new BigInteger(value.getBytes), scale), precision, scale) } } + + private def binaryToUnscaledLong(binary: Binary): Long = { + // The underlying `ByteBuffer` implementation is guaranteed to be `HeapByteBuffer`, so here + // we are using `Binary.toByteBuffer.array()` to steal the underlying byte array without + // copying it. 
+ val buffer = binary.toByteBuffer + val bytes = buffer.array() + val start = buffer.position() + val end = buffer.limit() + + var unscaled = 0L + var i = start + + while (i < end) { + unscaled = (unscaled << 8) | (bytes(i) & 0xff) + i += 1 + } + + val bits = 8 * (end - start) + unscaled = (unscaled << (64 - bits)) >> (64 - bits) + unscaled + } } /** diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/CatalystSchemaConverter.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/CatalystSchemaConverter.scala index 6904fc736c106..7f3394c20ed3d 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/CatalystSchemaConverter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/CatalystSchemaConverter.scala @@ -121,7 +121,7 @@ private[parquet] class CatalystSchemaConverter( val precision = field.getDecimalMetadata.getPrecision val scale = field.getDecimalMetadata.getScale - CatalystSchemaConverter.analysisRequire( + CatalystSchemaConverter.checkConversionRequirement( maxPrecision == -1 || 1 <= precision && precision <= maxPrecision, s"Invalid decimal precision: $typeName cannot store $precision digits (max $maxPrecision)") @@ -155,7 +155,7 @@ private[parquet] class CatalystSchemaConverter( } case INT96 => - CatalystSchemaConverter.analysisRequire( + CatalystSchemaConverter.checkConversionRequirement( assumeInt96IsTimestamp, "INT96 is not supported unless it's interpreted as timestamp. " + s"Please try to set ${SQLConf.PARQUET_INT96_AS_TIMESTAMP.key} to true.") @@ -197,11 +197,11 @@ private[parquet] class CatalystSchemaConverter( // // See: https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#lists case LIST => - CatalystSchemaConverter.analysisRequire( + CatalystSchemaConverter.checkConversionRequirement( field.getFieldCount == 1, s"Invalid list type $field") val repeatedType = field.getType(0) - CatalystSchemaConverter.analysisRequire( + CatalystSchemaConverter.checkConversionRequirement( repeatedType.isRepetition(REPEATED), s"Invalid list type $field") if (isElementType(repeatedType, field.getName)) { @@ -217,17 +217,17 @@ private[parquet] class CatalystSchemaConverter( // See: https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#backward-compatibility-rules-1 // scalastyle:on case MAP | MAP_KEY_VALUE => - CatalystSchemaConverter.analysisRequire( + CatalystSchemaConverter.checkConversionRequirement( field.getFieldCount == 1 && !field.getType(0).isPrimitive, s"Invalid map type: $field") val keyValueType = field.getType(0).asGroupType() - CatalystSchemaConverter.analysisRequire( + CatalystSchemaConverter.checkConversionRequirement( keyValueType.isRepetition(REPEATED) && keyValueType.getFieldCount == 2, s"Invalid map type: $field") val keyType = keyValueType.getType(0) - CatalystSchemaConverter.analysisRequire( + CatalystSchemaConverter.checkConversionRequirement( keyType.isPrimitive, s"Map key type is expected to be a primitive type, but found: $keyType") @@ -299,7 +299,10 @@ private[parquet] class CatalystSchemaConverter( * Converts a Spark SQL [[StructType]] to a Parquet [[MessageType]]. 
*/ def convert(catalystSchema: StructType): MessageType = { - Types.buildMessage().addFields(catalystSchema.map(convertField): _*).named("root") + Types + .buildMessage() + .addFields(catalystSchema.map(convertField): _*) + .named(CatalystSchemaConverter.SPARK_PARQUET_SCHEMA_NAME) } /** @@ -347,10 +350,10 @@ private[parquet] class CatalystSchemaConverter( // NOTE: Spark SQL TimestampType is NOT a well defined type in Parquet format spec. // // As stated in PARQUET-323, Parquet `INT96` was originally introduced to represent nanosecond - // timestamp in Impala for some historical reasons, it's not recommended to be used for any - // other types and will probably be deprecated in future Parquet format spec. That's the - // reason why Parquet format spec only defines `TIMESTAMP_MILLIS` and `TIMESTAMP_MICROS` which - // are both logical types annotating `INT64`. + // timestamp in Impala for some historical reasons. It's not recommended to be used for any + // other types and will probably be deprecated in some future version of parquet-format spec. + // That's the reason why parquet-format spec only defines `TIMESTAMP_MILLIS` and + // `TIMESTAMP_MICROS` which are both logical types annotating `INT64`. // // Originally, Spark SQL uses the same nanosecond timestamp type as Impala and Hive. Starting // from Spark 1.5.0, we resort to a timestamp type with 100 ns precision so that we can store @@ -361,7 +364,7 @@ private[parquet] class CatalystSchemaConverter( // currently not implemented yet because parquet-mr 1.7.0 (the version we're currently using) // hasn't implemented `TIMESTAMP_MICROS` yet. // - // TODO Implements `TIMESTAMP_MICROS` once parquet-mr has that. + // TODO Converts `TIMESTAMP_MICROS` once parquet-mr implements that. case TimestampType => Types.primitive(INT96, repetition).named(field.name) @@ -523,11 +526,12 @@ private[parquet] class CatalystSchemaConverter( } } - private[parquet] object CatalystSchemaConverter { + val SPARK_PARQUET_SCHEMA_NAME = "spark_schema" + def checkFieldName(name: String): Unit = { // ,;{}()\n\t= and space are special characters in Parquet schema - analysisRequire( + checkConversionRequirement( !name.matches(".*[ ,;{}()\n\t=].*"), s"""Attribute name "$name" contains invalid character(s) among " ,;{}()\\n\\t=". |Please use alias to rename it. @@ -539,7 +543,7 @@ private[parquet] object CatalystSchemaConverter { schema } - def analysisRequire(f: => Boolean, message: String): Unit = { + def checkConversionRequirement(f: => Boolean, message: String): Unit = { if (!f) { throw new AnalysisException(message) } @@ -553,16 +557,8 @@ private[parquet] object CatalystSchemaConverter { numBytes } - private val MIN_BYTES_FOR_PRECISION = Array.tabulate[Int](39)(computeMinBytesForPrecision) - // Returns the minimum number of bytes needed to store a decimal with a given `precision`. 
- def minBytesForPrecision(precision : Int) : Int = { - if (precision < MIN_BYTES_FOR_PRECISION.length) { - MIN_BYTES_FOR_PRECISION(precision) - } else { - computeMinBytesForPrecision(precision) - } - } + val minBytesForPrecision = Array.tabulate[Int](39)(computeMinBytesForPrecision) val MAX_PRECISION_FOR_INT32 = maxPrecisionForBytes(4) /* 9 */ diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/CatalystWriteSupport.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/CatalystWriteSupport.scala new file mode 100644 index 0000000000000..483363d2c1a21 --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/CatalystWriteSupport.scala @@ -0,0 +1,436 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.execution.datasources.parquet + +import java.nio.{ByteBuffer, ByteOrder} +import java.util + +import scala.collection.JavaConverters.mapAsJavaMapConverter + +import org.apache.hadoop.conf.Configuration +import org.apache.parquet.column.ParquetProperties +import org.apache.parquet.hadoop.ParquetOutputFormat +import org.apache.parquet.hadoop.api.WriteSupport +import org.apache.parquet.hadoop.api.WriteSupport.WriteContext +import org.apache.parquet.io.api.{Binary, RecordConsumer} + +import org.apache.spark.Logging +import org.apache.spark.sql.SQLConf +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.expressions.SpecializedGetters +import org.apache.spark.sql.catalyst.util.DateTimeUtils +import org.apache.spark.sql.execution.datasources.parquet.CatalystSchemaConverter.{MAX_PRECISION_FOR_INT32, MAX_PRECISION_FOR_INT64, minBytesForPrecision} +import org.apache.spark.sql.types._ + +/** + * A Parquet [[WriteSupport]] implementation that writes Catalyst [[InternalRow]]s as Parquet + * messages. This class can write Parquet data in two modes: + * + * - Standard mode: Parquet data are written in standard format defined in parquet-format spec. + * - Legacy mode: Parquet data are written in legacy format compatible with Spark 1.4 and prior. + * + * This behavior can be controlled by SQL option `spark.sql.parquet.writeLegacyFormat`. The value + * of this option is propagated to this class by the `init()` method and its Hadoop configuration + * argument. + */ +private[parquet] class CatalystWriteSupport extends WriteSupport[InternalRow] with Logging { + // A `ValueWriter` is responsible for writing a field of an `InternalRow` to the record consumer. + // Here we are using `SpecializedGetters` rather than `InternalRow` so that we can directly access + // data in `ArrayData` without the help of `SpecificMutableRow`. 
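+  // For example, the writer generated for an IntegerType field simply calls
+  // `recordConsumer.addInteger(row.getInt(ordinal))` for the requested ordinal.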
+ private type ValueWriter = (SpecializedGetters, Int) => Unit + + // Schema of the `InternalRow`s to be written + private var schema: StructType = _ + + // `ValueWriter`s for all fields of the schema + private var rootFieldWriters: Seq[ValueWriter] = _ + + // The Parquet `RecordConsumer` to which all `InternalRow`s are written + private var recordConsumer: RecordConsumer = _ + + // Whether to write data in legacy Parquet format compatible with Spark 1.4 and prior versions + private var writeLegacyParquetFormat: Boolean = _ + + // Reusable byte array used to write timestamps as Parquet INT96 values + private val timestampBuffer = new Array[Byte](12) + + // Reusable byte array used to write decimal values + private val decimalBuffer = new Array[Byte](minBytesForPrecision(DecimalType.MAX_PRECISION)) + + override def init(configuration: Configuration): WriteContext = { + val schemaString = configuration.get(CatalystWriteSupport.SPARK_ROW_SCHEMA) + this.schema = StructType.fromString(schemaString) + this.writeLegacyParquetFormat = { + // `SQLConf.PARQUET_WRITE_LEGACY_FORMAT` should always be explicitly set in ParquetRelation + assert(configuration.get(SQLConf.PARQUET_WRITE_LEGACY_FORMAT.key) != null) + configuration.get(SQLConf.PARQUET_WRITE_LEGACY_FORMAT.key).toBoolean + } + this.rootFieldWriters = schema.map(_.dataType).map(makeWriter) + + val messageType = new CatalystSchemaConverter(configuration).convert(schema) + val metadata = Map(CatalystReadSupport.SPARK_METADATA_KEY -> schemaString).asJava + + logInfo( + s"""Initialized Parquet WriteSupport with Catalyst schema: + |${schema.prettyJson} + |and corresponding Parquet message type: + |$messageType + """.stripMargin) + + new WriteContext(messageType, metadata) + } + + override def prepareForWrite(recordConsumer: RecordConsumer): Unit = { + this.recordConsumer = recordConsumer + } + + override def write(row: InternalRow): Unit = { + consumeMessage { + writeFields(row, schema, rootFieldWriters) + } + } + + private def writeFields( + row: InternalRow, schema: StructType, fieldWriters: Seq[ValueWriter]): Unit = { + var i = 0 + while (i < row.numFields) { + if (!row.isNullAt(i)) { + consumeField(schema(i).name, i) { + fieldWriters(i).apply(row, i) + } + } + i += 1 + } + } + + private def makeWriter(dataType: DataType): ValueWriter = { + dataType match { + case BooleanType => + (row: SpecializedGetters, ordinal: Int) => + recordConsumer.addBoolean(row.getBoolean(ordinal)) + + case ByteType => + (row: SpecializedGetters, ordinal: Int) => + recordConsumer.addInteger(row.getByte(ordinal)) + + case ShortType => + (row: SpecializedGetters, ordinal: Int) => + recordConsumer.addInteger(row.getShort(ordinal)) + + case IntegerType | DateType => + (row: SpecializedGetters, ordinal: Int) => + recordConsumer.addInteger(row.getInt(ordinal)) + + case LongType => + (row: SpecializedGetters, ordinal: Int) => + recordConsumer.addLong(row.getLong(ordinal)) + + case FloatType => + (row: SpecializedGetters, ordinal: Int) => + recordConsumer.addFloat(row.getFloat(ordinal)) + + case DoubleType => + (row: SpecializedGetters, ordinal: Int) => + recordConsumer.addDouble(row.getDouble(ordinal)) + + case StringType => + (row: SpecializedGetters, ordinal: Int) => + recordConsumer.addBinary(Binary.fromByteArray(row.getUTF8String(ordinal).getBytes)) + + case TimestampType => + (row: SpecializedGetters, ordinal: Int) => { + // TODO Writes `TimestampType` values as `TIMESTAMP_MICROS` once parquet-mr implements it + // Currently we only support timestamps stored as INT96, 
which is compatible with Hive + // and Impala. However, INT96 is to be deprecated. We plan to support `TIMESTAMP_MICROS` + // defined in the parquet-format spec. But up until writing, the most recent parquet-mr + // version (1.8.1) hasn't implemented it yet. + + // NOTE: Starting from Spark 1.5, Spark SQL `TimestampType` only has microsecond + // precision. Nanosecond parts of timestamp values read from INT96 are simply stripped. + val (julianDay, timeOfDayNanos) = DateTimeUtils.toJulianDay(row.getLong(ordinal)) + val buf = ByteBuffer.wrap(timestampBuffer) + buf.order(ByteOrder.LITTLE_ENDIAN).putLong(timeOfDayNanos).putInt(julianDay) + recordConsumer.addBinary(Binary.fromByteArray(timestampBuffer)) + } + + case BinaryType => + (row: SpecializedGetters, ordinal: Int) => + recordConsumer.addBinary(Binary.fromByteArray(row.getBinary(ordinal))) + + case DecimalType.Fixed(precision, scale) => + makeDecimalWriter(precision, scale) + + case t: StructType => + val fieldWriters = t.map(_.dataType).map(makeWriter) + (row: SpecializedGetters, ordinal: Int) => + consumeGroup { + writeFields(row.getStruct(ordinal, t.length), t, fieldWriters) + } + + case t: ArrayType => makeArrayWriter(t) + + case t: MapType => makeMapWriter(t) + + case t: UserDefinedType[_] => makeWriter(t.sqlType) + + // TODO Adds IntervalType support + case _ => sys.error(s"Unsupported data type $dataType.") + } + } + + private def makeDecimalWriter(precision: Int, scale: Int): ValueWriter = { + assert( + precision <= DecimalType.MAX_PRECISION, + s"Decimal precision $precision exceeds max precision ${DecimalType.MAX_PRECISION}") + + val numBytes = minBytesForPrecision(precision) + + val int32Writer = + (row: SpecializedGetters, ordinal: Int) => { + val unscaledLong = row.getDecimal(ordinal, precision, scale).toUnscaledLong + recordConsumer.addInteger(unscaledLong.toInt) + } + + val int64Writer = + (row: SpecializedGetters, ordinal: Int) => { + val unscaledLong = row.getDecimal(ordinal, precision, scale).toUnscaledLong + recordConsumer.addLong(unscaledLong) + } + + val binaryWriterUsingUnscaledLong = + (row: SpecializedGetters, ordinal: Int) => { + // When the precision is low enough (<= 18) to squeeze the decimal value into a `Long`, we + // can build a fixed-length byte array with length `numBytes` using the unscaled `Long` + // value and the `decimalBuffer` for better performance. + val unscaled = row.getDecimal(ordinal, precision, scale).toUnscaledLong + var i = 0 + var shift = 8 * (numBytes - 1) + + while (i < numBytes) { + decimalBuffer(i) = (unscaled >> shift).toByte + i += 1 + shift -= 8 + } + + recordConsumer.addBinary(Binary.fromByteArray(decimalBuffer, 0, numBytes)) + } + + val binaryWriterUsingUnscaledBytes = + (row: SpecializedGetters, ordinal: Int) => { + val decimal = row.getDecimal(ordinal, precision, scale) + val bytes = decimal.toJavaBigDecimal.unscaledValue().toByteArray + val fixedLengthBytes = if (bytes.length == numBytes) { + // If the length of the underlying byte array of the unscaled `BigInteger` happens to be + // `numBytes`, just reuse it, so that we don't bother copying it to `decimalBuffer`. + bytes + } else { + // Otherwise, the length must be less than `numBytes`. In this case we copy contents of + // the underlying bytes with padding sign bytes to `decimalBuffer` to form the result + // fixed-length byte array. 
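+            // For example, an unscaled value of -1 must be left-padded with 0xFF bytes while a
+            // positive value is padded with 0x00, so the padding byte mirrors the sign of `bytes.head`.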
+ val signByte = if (bytes.head < 0) -1: Byte else 0: Byte + util.Arrays.fill(decimalBuffer, 0, numBytes - bytes.length, signByte) + System.arraycopy(bytes, 0, decimalBuffer, numBytes - bytes.length, bytes.length) + decimalBuffer + } + + recordConsumer.addBinary(Binary.fromByteArray(fixedLengthBytes, 0, numBytes)) + } + + writeLegacyParquetFormat match { + // Standard mode, 1 <= precision <= 9, writes as INT32 + case false if precision <= MAX_PRECISION_FOR_INT32 => int32Writer + + // Standard mode, 10 <= precision <= 18, writes as INT64 + case false if precision <= MAX_PRECISION_FOR_INT64 => int64Writer + + // Legacy mode, 1 <= precision <= 18, writes as FIXED_LEN_BYTE_ARRAY + case true if precision <= MAX_PRECISION_FOR_INT64 => binaryWriterUsingUnscaledLong + + // Either standard or legacy mode, 19 <= precision <= 38, writes as FIXED_LEN_BYTE_ARRAY + case _ => binaryWriterUsingUnscaledBytes + } + } + + def makeArrayWriter(arrayType: ArrayType): ValueWriter = { + val elementWriter = makeWriter(arrayType.elementType) + + def threeLevelArrayWriter(repeatedGroupName: String, elementFieldName: String): ValueWriter = + (row: SpecializedGetters, ordinal: Int) => { + val array = row.getArray(ordinal) + consumeGroup { + // Only creates the repeated field if the array is non-empty. + if (array.numElements() > 0) { + consumeField(repeatedGroupName, 0) { + var i = 0 + while (i < array.numElements()) { + consumeGroup { + // Only creates the element field if the current array element is not null. + if (!array.isNullAt(i)) { + consumeField(elementFieldName, 0) { + elementWriter.apply(array, i) + } + } + } + i += 1 + } + } + } + } + } + + def twoLevelArrayWriter(repeatedFieldName: String): ValueWriter = + (row: SpecializedGetters, ordinal: Int) => { + val array = row.getArray(ordinal) + consumeGroup { + // Only creates the repeated field if the array is non-empty. 
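+            // Writing the outer group with zero occurrences of the repeated field is how Parquet
+            // distinguishes an empty list from a null one.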
+ if (array.numElements() > 0) { + consumeField(repeatedFieldName, 0) { + var i = 0 + while (i < array.numElements()) { + elementWriter.apply(array, i) + i += 1 + } + } + } + } + } + + (writeLegacyParquetFormat, arrayType.containsNull) match { + case (legacyMode @ false, _) => + // Standard mode: + // + // group (LIST) { + // repeated group list { + // ^~~~ repeatedGroupName + // element; + // ^~~~~~~ elementFieldName + // } + // } + threeLevelArrayWriter(repeatedGroupName = "list", elementFieldName = "element") + + case (legacyMode @ true, nullableElements @ true) => + // Legacy mode, with nullable elements: + // + // group (LIST) { + // optional group bag { + // ^~~ repeatedGroupName + // repeated array; + // ^~~~~ elementFieldName + // } + // } + threeLevelArrayWriter(repeatedGroupName = "bag", elementFieldName = "array") + + case (legacyMode @ true, nullableElements @ false) => + // Legacy mode, with non-nullable elements: + // + // group (LIST) { + // repeated array; + // ^~~~~ repeatedFieldName + // } + twoLevelArrayWriter(repeatedFieldName = "array") + } + } + + private def makeMapWriter(mapType: MapType): ValueWriter = { + val keyWriter = makeWriter(mapType.keyType) + val valueWriter = makeWriter(mapType.valueType) + val repeatedGroupName = if (writeLegacyParquetFormat) { + // Legacy mode: + // + // group (MAP) { + // repeated group map (MAP_KEY_VALUE) { + // ^~~ repeatedGroupName + // required key; + // value; + // } + // } + "map" + } else { + // Standard mode: + // + // group (MAP) { + // repeated group key_value { + // ^~~~~~~~~ repeatedGroupName + // required key; + // value; + // } + // } + "key_value" + } + + (row: SpecializedGetters, ordinal: Int) => { + val map = row.getMap(ordinal) + val keyArray = map.keyArray() + val valueArray = map.valueArray() + + consumeGroup { + // Only creates the repeated field if the map is non-empty. 
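+        // As with arrays, an empty map is encoded as the outer group containing no repeated entries.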
+ if (map.numElements() > 0) { + consumeField(repeatedGroupName, 0) { + var i = 0 + while (i < map.numElements()) { + consumeGroup { + consumeField("key", 0) { + keyWriter.apply(keyArray, i) + } + + // Only creates the "value" field if the value if non-empty + if (!map.valueArray().isNullAt(i)) { + consumeField("value", 1) { + valueWriter.apply(valueArray, i) + } + } + } + i += 1 + } + } + } + } + } + } + + private def consumeMessage(f: => Unit): Unit = { + recordConsumer.startMessage() + f + recordConsumer.endMessage() + } + + private def consumeGroup(f: => Unit): Unit = { + recordConsumer.startGroup() + f + recordConsumer.endGroup() + } + + private def consumeField(field: String, index: Int)(f: => Unit): Unit = { + recordConsumer.startField(field, index) + f + recordConsumer.endField(field, index) + } +} + +private[parquet] object CatalystWriteSupport { + val SPARK_ROW_SCHEMA: String = "org.apache.spark.sql.parquet.row.attributes" + + def setSchema(schema: StructType, configuration: Configuration): Unit = { + schema.map(_.name).foreach(CatalystSchemaConverter.checkFieldName) + configuration.set(SPARK_ROW_SCHEMA, schema.json) + configuration.set( + ParquetOutputFormat.WRITER_VERSION, + ParquetProperties.WriterVersion.PARQUET_1_0.toString) + } +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/DirectParquetOutputCommitter.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/DirectParquetOutputCommitter.scala index de1fd0166ac5a..300e8677b312f 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/DirectParquetOutputCommitter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/DirectParquetOutputCommitter.scala @@ -39,7 +39,7 @@ import org.apache.parquet.hadoop.{ParquetFileReader, ParquetFileWriter, ParquetO * * NEVER use [[DirectParquetOutputCommitter]] when appending data, because currently there's * no safe way undo a failed appending job (that's why both `abortTask()` and `abortJob()` are - * left * empty). + * left empty). */ private[parquet] class DirectParquetOutputCommitter(outputPath: Path, context: TaskAttemptContext) extends ParquetOutputCommitter(outputPath, context) { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetConverter.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetConverter.scala deleted file mode 100644 index ccd7ebf319af9..0000000000000 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetConverter.scala +++ /dev/null @@ -1,39 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.spark.sql.execution.datasources.parquet - -import org.apache.spark.sql.catalyst.InternalRow -import org.apache.spark.sql.types.{MapData, ArrayData} - -// TODO Removes this while fixing SPARK-8848 -private[sql] object CatalystConverter { - // This is mostly Parquet convention (see, e.g., `ConversionPatterns`). - // Note that "array" for the array elements is chosen by ParquetAvro. - // Using a different value will result in Parquet silently dropping columns. - val ARRAY_CONTAINS_NULL_BAG_SCHEMA_NAME = "bag" - val ARRAY_ELEMENTS_SCHEMA_NAME = "array" - - val MAP_KEY_SCHEMA_NAME = "key" - val MAP_VALUE_SCHEMA_NAME = "value" - val MAP_SCHEMA_NAME = "map" - - // TODO: consider using Array[T] for arrays to avoid boxing of primitive types - type ArrayScalaType = ArrayData - type StructScalaType = InternalRow - type MapScalaType = MapData -} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilters.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilters.scala index c6b3fe7900da8..78040d99fb0a5 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilters.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilters.scala @@ -18,24 +18,17 @@ package org.apache.spark.sql.execution.datasources.parquet import java.io.Serializable -import java.nio.ByteBuffer -import com.google.common.io.BaseEncoding -import org.apache.hadoop.conf.Configuration import org.apache.parquet.filter2.predicate.FilterApi._ import org.apache.parquet.filter2.predicate._ import org.apache.parquet.io.api.Binary import org.apache.parquet.schema.OriginalType import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName -import org.apache.spark.SparkEnv -import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.sources import org.apache.spark.sql.types._ private[sql] object ParquetFilters { - val PARQUET_FILTER_DATA = "org.apache.spark.sql.parquet.row.filter" - case class SetInFilter[T <: Comparable[T]]( valueSet: Set[T]) extends UserDefinedPredicate[T] with Serializable { @@ -282,33 +275,4 @@ private[sql] object ParquetFilters { addMethod.setAccessible(true) addMethod.invoke(null, classOf[Binary], enumTypeDescriptor) } - - /** - * Note: Inside the Hadoop API we only have access to `Configuration`, not to - * [[org.apache.spark.SparkContext]], so we cannot use broadcasts to convey - * the actual filter predicate. - */ - def serializeFilterExpressions(filters: Seq[Expression], conf: Configuration): Unit = { - if (filters.nonEmpty) { - val serialized: Array[Byte] = - SparkEnv.get.closureSerializer.newInstance().serialize(filters).array() - val encoded: String = BaseEncoding.base64().encode(serialized) - conf.set(PARQUET_FILTER_DATA, encoded) - } - } - - /** - * Note: Inside the Hadoop API we only have access to `Configuration`, not to - * [[org.apache.spark.SparkContext]], so we cannot use broadcasts to convey - * the actual filter predicate. 
- */ - def deserializeFilterExpressions(conf: Configuration): Seq[Expression] = { - val data = conf.get(PARQUET_FILTER_DATA) - if (data != null) { - val decoded: Array[Byte] = BaseEncoding.base64().decode(data) - SparkEnv.get.closureSerializer.newInstance().deserialize(ByteBuffer.wrap(decoded)) - } else { - Seq() - } - } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetRelation.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetRelation.scala index 8a9c0e733a9a1..77d851ca486b3 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetRelation.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetRelation.scala @@ -218,8 +218,8 @@ private[sql] class ParquetRelation( } // SPARK-9849 DirectParquetOutputCommitter qualified name should be backward compatible - val committerClassname = conf.get(SQLConf.PARQUET_OUTPUT_COMMITTER_CLASS.key) - if (committerClassname == "org.apache.spark.sql.parquet.DirectParquetOutputCommitter") { + val committerClassName = conf.get(SQLConf.PARQUET_OUTPUT_COMMITTER_CLASS.key) + if (committerClassName == "org.apache.spark.sql.parquet.DirectParquetOutputCommitter") { conf.set(SQLConf.PARQUET_OUTPUT_COMMITTER_CLASS.key, classOf[DirectParquetOutputCommitter].getCanonicalName) } @@ -248,18 +248,22 @@ private[sql] class ParquetRelation( // bundled with `ParquetOutputFormat[Row]`. job.setOutputFormatClass(classOf[ParquetOutputFormat[Row]]) - // TODO There's no need to use two kinds of WriteSupport - // We should unify them. `SpecificMutableRow` can process both atomic (primitive) types and - // complex types. - val writeSupportClass = - if (dataSchema.map(_.dataType).forall(ParquetTypesConverter.isPrimitiveType)) { - classOf[MutableRowWriteSupport] - } else { - classOf[RowWriteSupport] - } + ParquetOutputFormat.setWriteSupportClass(job, classOf[CatalystWriteSupport]) + CatalystWriteSupport.setSchema(dataSchema, conf) + + // Sets flags for `CatalystSchemaConverter` (which converts Catalyst schema to Parquet schema) + // and `CatalystWriteSupport` (writing actual rows to Parquet files). + conf.set( + SQLConf.PARQUET_BINARY_AS_STRING.key, + sqlContext.conf.isParquetBinaryAsString.toString) - ParquetOutputFormat.setWriteSupportClass(job, writeSupportClass) - RowWriteSupport.setSchema(dataSchema.toAttributes, conf) + conf.set( + SQLConf.PARQUET_INT96_AS_TIMESTAMP.key, + sqlContext.conf.isParquetINT96AsTimestamp.toString) + + conf.set( + SQLConf.PARQUET_WRITE_LEGACY_FORMAT.key, + sqlContext.conf.writeLegacyParquetFormat.toString) // Sets compression scheme conf.set( @@ -287,7 +291,6 @@ private[sql] class ParquetRelation( val parquetFilterPushDown = sqlContext.conf.parquetFilterPushDown val assumeBinaryIsString = sqlContext.conf.isParquetBinaryAsString val assumeInt96IsTimestamp = sqlContext.conf.isParquetINT96AsTimestamp - val writeLegacyParquetFormat = sqlContext.conf.writeLegacyParquetFormat // Parquet row group size. We will use this value as the value for // mapreduce.input.fileinputformat.split.minsize and mapred.min.split.size if the value @@ -304,8 +307,7 @@ private[sql] class ParquetRelation( useMetadataCache, parquetFilterPushDown, assumeBinaryIsString, - assumeInt96IsTimestamp, - writeLegacyParquetFormat) _ + assumeInt96IsTimestamp) _ // Create the function to set input paths at the driver side. 
val setInputPaths = @@ -530,8 +532,7 @@ private[sql] object ParquetRelation extends Logging { useMetadataCache: Boolean, parquetFilterPushDown: Boolean, assumeBinaryIsString: Boolean, - assumeInt96IsTimestamp: Boolean, - writeLegacyParquetFormat: Boolean)(job: Job): Unit = { + assumeInt96IsTimestamp: Boolean)(job: Job): Unit = { val conf = SparkHadoopUtil.get.getConfigurationFromJobContext(job) conf.set(ParquetInputFormat.READ_SUPPORT_CLASS, classOf[CatalystReadSupport].getName) @@ -552,16 +553,15 @@ private[sql] object ParquetRelation extends Logging { }) conf.set( - RowWriteSupport.SPARK_ROW_SCHEMA, + CatalystWriteSupport.SPARK_ROW_SCHEMA, CatalystSchemaConverter.checkFieldNames(dataSchema).json) // Tell FilteringParquetRowInputFormat whether it's okay to cache Parquet and FS metadata conf.setBoolean(SQLConf.PARQUET_CACHE_METADATA.key, useMetadataCache) - // Sets flags for Parquet schema conversion + // Sets flags for `CatalystSchemaConverter` conf.setBoolean(SQLConf.PARQUET_BINARY_AS_STRING.key, assumeBinaryIsString) conf.setBoolean(SQLConf.PARQUET_INT96_AS_TIMESTAMP.key, assumeInt96IsTimestamp) - conf.setBoolean(SQLConf.PARQUET_WRITE_LEGACY_FORMAT.key, writeLegacyParquetFormat) overrideMinSplitSize(parquetBlockSize, conf) } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetTableSupport.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetTableSupport.scala deleted file mode 100644 index ed89aa27aa1f0..0000000000000 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetTableSupport.scala +++ /dev/null @@ -1,321 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.sql.execution.datasources.parquet - -import java.math.BigInteger -import java.nio.{ByteBuffer, ByteOrder} -import java.util.{HashMap => JHashMap} - -import org.apache.hadoop.conf.Configuration -import org.apache.parquet.column.ParquetProperties -import org.apache.parquet.hadoop.ParquetOutputFormat -import org.apache.parquet.hadoop.api.WriteSupport -import org.apache.parquet.io.api._ - -import org.apache.spark.Logging -import org.apache.spark.sql.catalyst.InternalRow -import org.apache.spark.sql.catalyst.expressions.Attribute -import org.apache.spark.sql.catalyst.util.DateTimeUtils -import org.apache.spark.sql.types._ -import org.apache.spark.unsafe.types.UTF8String - -/** - * A `parquet.hadoop.api.WriteSupport` for Row objects. 
- */ -private[parquet] class RowWriteSupport extends WriteSupport[InternalRow] with Logging { - - private[parquet] var writer: RecordConsumer = null - private[parquet] var attributes: Array[Attribute] = null - - override def init(configuration: Configuration): WriteSupport.WriteContext = { - val origAttributesStr: String = configuration.get(RowWriteSupport.SPARK_ROW_SCHEMA) - val metadata = new JHashMap[String, String]() - metadata.put(CatalystReadSupport.SPARK_METADATA_KEY, origAttributesStr) - - if (attributes == null) { - attributes = ParquetTypesConverter.convertFromString(origAttributesStr).toArray - } - - log.debug(s"write support initialized for requested schema $attributes") - new WriteSupport.WriteContext(ParquetTypesConverter.convertFromAttributes(attributes), metadata) - } - - override def prepareForWrite(recordConsumer: RecordConsumer): Unit = { - writer = recordConsumer - log.debug(s"preparing for write with schema $attributes") - } - - override def write(record: InternalRow): Unit = { - val attributesSize = attributes.size - if (attributesSize > record.numFields) { - throw new IndexOutOfBoundsException("Trying to write more fields than contained in row " + - s"($attributesSize > ${record.numFields})") - } - - var index = 0 - writer.startMessage() - while(index < attributesSize) { - // null values indicate optional fields but we do not check currently - if (!record.isNullAt(index)) { - writer.startField(attributes(index).name, index) - writeValue(attributes(index).dataType, record.get(index, attributes(index).dataType)) - writer.endField(attributes(index).name, index) - } - index = index + 1 - } - writer.endMessage() - } - - private[parquet] def writeValue(schema: DataType, value: Any): Unit = { - if (value != null) { - schema match { - case t: UserDefinedType[_] => writeValue(t.sqlType, value) - case t @ ArrayType(_, _) => writeArray( - t, - value.asInstanceOf[CatalystConverter.ArrayScalaType]) - case t @ MapType(_, _, _) => writeMap( - t, - value.asInstanceOf[CatalystConverter.MapScalaType]) - case t @ StructType(_) => writeStruct( - t, - value.asInstanceOf[CatalystConverter.StructScalaType]) - case _ => writePrimitive(schema.asInstanceOf[AtomicType], value) - } - } - } - - private[parquet] def writePrimitive(schema: DataType, value: Any): Unit = { - if (value != null) { - schema match { - case BooleanType => writer.addBoolean(value.asInstanceOf[Boolean]) - case ByteType => writer.addInteger(value.asInstanceOf[Byte]) - case ShortType => writer.addInteger(value.asInstanceOf[Short]) - case IntegerType | DateType => writer.addInteger(value.asInstanceOf[Int]) - case LongType => writer.addLong(value.asInstanceOf[Long]) - case TimestampType => writeTimestamp(value.asInstanceOf[Long]) - case FloatType => writer.addFloat(value.asInstanceOf[Float]) - case DoubleType => writer.addDouble(value.asInstanceOf[Double]) - case StringType => writer.addBinary( - Binary.fromByteArray(value.asInstanceOf[UTF8String].getBytes)) - case BinaryType => writer.addBinary( - Binary.fromByteArray(value.asInstanceOf[Array[Byte]])) - case DecimalType.Fixed(precision, _) => - writeDecimal(value.asInstanceOf[Decimal], precision) - case _ => sys.error(s"Do not know how to writer $schema to consumer") - } - } - } - - private[parquet] def writeStruct( - schema: StructType, - struct: CatalystConverter.StructScalaType): Unit = { - if (struct != null) { - val fields = schema.fields.toArray - writer.startGroup() - var i = 0 - while(i < fields.length) { - if (!struct.isNullAt(i)) { - 
writer.startField(fields(i).name, i) - writeValue(fields(i).dataType, struct.get(i, fields(i).dataType)) - writer.endField(fields(i).name, i) - } - i = i + 1 - } - writer.endGroup() - } - } - - private[parquet] def writeArray( - schema: ArrayType, - array: CatalystConverter.ArrayScalaType): Unit = { - val elementType = schema.elementType - writer.startGroup() - if (array.numElements() > 0) { - if (schema.containsNull) { - writer.startField(CatalystConverter.ARRAY_CONTAINS_NULL_BAG_SCHEMA_NAME, 0) - var i = 0 - while (i < array.numElements()) { - writer.startGroup() - if (!array.isNullAt(i)) { - writer.startField(CatalystConverter.ARRAY_ELEMENTS_SCHEMA_NAME, 0) - writeValue(elementType, array.get(i, elementType)) - writer.endField(CatalystConverter.ARRAY_ELEMENTS_SCHEMA_NAME, 0) - } - writer.endGroup() - i = i + 1 - } - writer.endField(CatalystConverter.ARRAY_CONTAINS_NULL_BAG_SCHEMA_NAME, 0) - } else { - writer.startField(CatalystConverter.ARRAY_ELEMENTS_SCHEMA_NAME, 0) - var i = 0 - while (i < array.numElements()) { - writeValue(elementType, array.get(i, elementType)) - i = i + 1 - } - writer.endField(CatalystConverter.ARRAY_ELEMENTS_SCHEMA_NAME, 0) - } - } - writer.endGroup() - } - - private[parquet] def writeMap( - schema: MapType, - map: CatalystConverter.MapScalaType): Unit = { - writer.startGroup() - val length = map.numElements() - if (length > 0) { - writer.startField(CatalystConverter.MAP_SCHEMA_NAME, 0) - map.foreach(schema.keyType, schema.valueType, (key, value) => { - writer.startGroup() - writer.startField(CatalystConverter.MAP_KEY_SCHEMA_NAME, 0) - writeValue(schema.keyType, key) - writer.endField(CatalystConverter.MAP_KEY_SCHEMA_NAME, 0) - if (value != null) { - writer.startField(CatalystConverter.MAP_VALUE_SCHEMA_NAME, 1) - writeValue(schema.valueType, value) - writer.endField(CatalystConverter.MAP_VALUE_SCHEMA_NAME, 1) - } - writer.endGroup() - }) - writer.endField(CatalystConverter.MAP_SCHEMA_NAME, 0) - } - writer.endGroup() - } - - // Scratch array used to write decimals as fixed-length byte array - private[this] var reusableDecimalBytes = new Array[Byte](16) - - private[parquet] def writeDecimal(decimal: Decimal, precision: Int): Unit = { - val numBytes = CatalystSchemaConverter.minBytesForPrecision(precision) - - def longToBinary(unscaled: Long): Binary = { - var i = 0 - var shift = 8 * (numBytes - 1) - while (i < numBytes) { - reusableDecimalBytes(i) = (unscaled >> shift).toByte - i += 1 - shift -= 8 - } - Binary.fromByteArray(reusableDecimalBytes, 0, numBytes) - } - - def bigIntegerToBinary(unscaled: BigInteger): Binary = { - unscaled.toByteArray match { - case bytes if bytes.length == numBytes => - Binary.fromByteArray(bytes) - - case bytes if bytes.length <= reusableDecimalBytes.length => - val signedByte = (if (bytes.head < 0) -1 else 0).toByte - java.util.Arrays.fill(reusableDecimalBytes, 0, numBytes - bytes.length, signedByte) - System.arraycopy(bytes, 0, reusableDecimalBytes, numBytes - bytes.length, bytes.length) - Binary.fromByteArray(reusableDecimalBytes, 0, numBytes) - - case bytes => - reusableDecimalBytes = new Array[Byte](bytes.length) - bigIntegerToBinary(unscaled) - } - } - - val binary = if (numBytes <= 8) { - longToBinary(decimal.toUnscaledLong) - } else { - bigIntegerToBinary(decimal.toJavaBigDecimal.unscaledValue()) - } - - writer.addBinary(binary) - } - - // array used to write Timestamp as Int96 (fixed-length binary) - private[this] val int96buf = new Array[Byte](12) - - private[parquet] def writeTimestamp(ts: Long): Unit = { - val (julianDay, 
timeOfDayNanos) = DateTimeUtils.toJulianDay(ts) - val buf = ByteBuffer.wrap(int96buf) - buf.order(ByteOrder.LITTLE_ENDIAN) - buf.putLong(timeOfDayNanos) - buf.putInt(julianDay) - writer.addBinary(Binary.fromByteArray(int96buf)) - } -} - -// Optimized for non-nested rows -private[parquet] class MutableRowWriteSupport extends RowWriteSupport { - override def write(record: InternalRow): Unit = { - val attributesSize = attributes.size - if (attributesSize > record.numFields) { - throw new IndexOutOfBoundsException("Trying to write more fields than contained in row " + - s"($attributesSize > ${record.numFields})") - } - - var index = 0 - writer.startMessage() - while(index < attributesSize) { - // null values indicate optional fields but we do not check currently - if (!record.isNullAt(index) && !record.isNullAt(index)) { - writer.startField(attributes(index).name, index) - consumeType(attributes(index).dataType, record, index) - writer.endField(attributes(index).name, index) - } - index = index + 1 - } - writer.endMessage() - } - - private def consumeType( - ctype: DataType, - record: InternalRow, - index: Int): Unit = { - ctype match { - case BooleanType => writer.addBoolean(record.getBoolean(index)) - case ByteType => writer.addInteger(record.getByte(index)) - case ShortType => writer.addInteger(record.getShort(index)) - case IntegerType | DateType => writer.addInteger(record.getInt(index)) - case LongType => writer.addLong(record.getLong(index)) - case TimestampType => writeTimestamp(record.getLong(index)) - case FloatType => writer.addFloat(record.getFloat(index)) - case DoubleType => writer.addDouble(record.getDouble(index)) - case StringType => - writer.addBinary(Binary.fromByteArray(record.getUTF8String(index).getBytes)) - case BinaryType => - writer.addBinary(Binary.fromByteArray(record.getBinary(index))) - case DecimalType.Fixed(precision, scale) => - writeDecimal(record.getDecimal(index, precision, scale), precision) - case _ => sys.error(s"Unsupported datatype $ctype, cannot write to consumer") - } - } -} - -private[parquet] object RowWriteSupport { - val SPARK_ROW_SCHEMA: String = "org.apache.spark.sql.parquet.row.attributes" - - def getSchema(configuration: Configuration): Seq[Attribute] = { - val schemaString = configuration.get(RowWriteSupport.SPARK_ROW_SCHEMA) - if (schemaString == null) { - throw new RuntimeException("Missing schema!") - } - ParquetTypesConverter.convertFromString(schemaString) - } - - def setSchema(schema: Seq[Attribute], configuration: Configuration) { - val encoded = ParquetTypesConverter.convertToString(schema) - configuration.set(SPARK_ROW_SCHEMA, encoded) - configuration.set( - ParquetOutputFormat.WRITER_VERSION, - ParquetProperties.WriterVersion.PARQUET_1_0.toString) - } -} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetTypesConverter.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetTypesConverter.scala deleted file mode 100644 index b647bb6116afa..0000000000000 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetTypesConverter.scala +++ /dev/null @@ -1,160 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.sql.execution.datasources.parquet - -import java.io.IOException -import java.util.{Collections, Arrays} - -import scala.util.Try - -import org.apache.hadoop.conf.Configuration -import org.apache.hadoop.fs.{FileSystem, Path} -import org.apache.hadoop.mapreduce.Job -import org.apache.parquet.format.converter.ParquetMetadataConverter -import org.apache.parquet.hadoop.metadata.{FileMetaData, ParquetMetadata} -import org.apache.parquet.hadoop.util.ContextUtil -import org.apache.parquet.hadoop.{Footer, ParquetFileReader, ParquetFileWriter} -import org.apache.parquet.schema.MessageType - -import org.apache.spark.Logging -import org.apache.spark.sql.catalyst.expressions.Attribute -import org.apache.spark.sql.types._ - - -private[parquet] object ParquetTypesConverter extends Logging { - def isPrimitiveType(ctype: DataType): Boolean = ctype match { - case _: NumericType | BooleanType | DateType | TimestampType | StringType | BinaryType => true - case _ => false - } - - /** - * Compute the FIXED_LEN_BYTE_ARRAY length needed to represent a given DECIMAL precision. - */ - private[parquet] val BYTES_FOR_PRECISION = Array.tabulate[Int](38) { precision => - var length = 1 - while (math.pow(2.0, 8 * length - 1) < math.pow(10.0, precision)) { - length += 1 - } - length - } - - def convertFromAttributes(attributes: Seq[Attribute]): MessageType = { - val converter = new CatalystSchemaConverter() - converter.convert(StructType.fromAttributes(attributes)) - } - - def convertFromString(string: String): Seq[Attribute] = { - Try(DataType.fromJson(string)).getOrElse(DataType.fromCaseClassString(string)) match { - case s: StructType => s.toAttributes - case other => sys.error(s"Can convert $string to row") - } - } - - def convertToString(schema: Seq[Attribute]): String = { - schema.map(_.name).foreach(CatalystSchemaConverter.checkFieldName) - StructType.fromAttributes(schema).json - } - - def writeMetaData(attributes: Seq[Attribute], origPath: Path, conf: Configuration): Unit = { - if (origPath == null) { - throw new IllegalArgumentException("Unable to write Parquet metadata: path is null") - } - val fs = origPath.getFileSystem(conf) - if (fs == null) { - throw new IllegalArgumentException( - s"Unable to write Parquet metadata: path $origPath is incorrectly formatted") - } - val path = origPath.makeQualified(fs) - if (fs.exists(path) && !fs.getFileStatus(path).isDir) { - throw new IllegalArgumentException(s"Expected to write to directory $path but found file") - } - val metadataPath = new Path(path, ParquetFileWriter.PARQUET_METADATA_FILE) - if (fs.exists(metadataPath)) { - try { - fs.delete(metadataPath, true) - } catch { - case e: IOException => - throw new IOException(s"Unable to delete previous PARQUET_METADATA_FILE at $metadataPath") - } - } - val extraMetadata = new java.util.HashMap[String, String]() - extraMetadata.put( - CatalystReadSupport.SPARK_METADATA_KEY, - 
ParquetTypesConverter.convertToString(attributes)) - // TODO: add extra data, e.g., table name, date, etc.? - - val parquetSchema: MessageType = ParquetTypesConverter.convertFromAttributes(attributes) - val metaData: FileMetaData = new FileMetaData( - parquetSchema, - extraMetadata, - "Spark") - - ParquetFileWriter.writeMetadataFile( - conf, - path, - Arrays.asList(new Footer(path, new ParquetMetadata(metaData, Collections.emptyList())))) - } - - /** - * Try to read Parquet metadata at the given Path. We first see if there is a summary file - * in the parent directory. If so, this is used. Else we read the actual footer at the given - * location. - * @param origPath The path at which we expect one (or more) Parquet files. - * @param configuration The Hadoop configuration to use. - * @return The `ParquetMetadata` containing among other things the schema. - */ - def readMetaData(origPath: Path, configuration: Option[Configuration]): ParquetMetadata = { - if (origPath == null) { - throw new IllegalArgumentException("Unable to read Parquet metadata: path is null") - } - val job = new Job() - val conf = { - // scalastyle:off jobcontext - configuration.getOrElse(ContextUtil.getConfiguration(job)) - // scalastyle:on jobcontext - } - val fs: FileSystem = origPath.getFileSystem(conf) - if (fs == null) { - throw new IllegalArgumentException(s"Incorrectly formatted Parquet metadata path $origPath") - } - val path = origPath.makeQualified(fs) - - val children = - fs - .globStatus(path) - .flatMap { status => if (status.isDir) fs.listStatus(status.getPath) else List(status) } - .filterNot { status => - val name = status.getPath.getName - (name(0) == '.' || name(0) == '_') && name != ParquetFileWriter.PARQUET_METADATA_FILE - } - - // NOTE (lian): Parquet "_metadata" file can be very slow if the file consists of lots of row - // groups. Since Parquet schema is replicated among all row groups, we only need to touch a - // single row group to read schema related metadata. Notice that we are making assumptions that - // all data in a single Parquet file have the same schema, which is normally true. - children - // Try any non-"_metadata" file first... - .find(_.getPath.getName != ParquetFileWriter.PARQUET_METADATA_FILE) - // ... and fallback to "_metadata" if no such file exists (which implies the Parquet file is - // empty, thus normally the "_metadata" file is expected to be fairly small). 
- .orElse(children.find(_.getPath.getName == ParquetFileWriter.PARQUET_METADATA_FILE)) - .map(ParquetFileReader.readFooter(conf, _, ParquetMetadataConverter.NO_FILTER)) - .getOrElse( - throw new IllegalArgumentException(s"Could not find Parquet metadata at path $path")) - } -} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/UserDefinedTypeSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/UserDefinedTypeSuite.scala index 7992fd59ff4ba..d17671d48a2fc 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/UserDefinedTypeSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/UserDefinedTypeSuite.scala @@ -24,6 +24,7 @@ import com.clearspring.analytics.stream.cardinality.HyperLogLog import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.CatalystTypeConverters import org.apache.spark.sql.catalyst.expressions.{OpenHashSetUDT, HyperLogLogUDT} +import org.apache.spark.sql.execution.datasources.parquet.ParquetTest import org.apache.spark.sql.functions._ import org.apache.spark.sql.test.SharedSQLContext import org.apache.spark.sql.types._ @@ -68,7 +69,7 @@ private[sql] class MyDenseVectorUDT extends UserDefinedType[MyDenseVector] { private[spark] override def asNullable: MyDenseVectorUDT = this } -class UserDefinedTypeSuite extends QueryTest with SharedSQLContext { +class UserDefinedTypeSuite extends QueryTest with SharedSQLContext with ParquetTest { import testImplicits._ private lazy val pointsRDD = Seq( @@ -98,17 +99,28 @@ class UserDefinedTypeSuite extends QueryTest with SharedSQLContext { Seq(Row(true), Row(true))) } - - test("UDTs with Parquet") { - val tempDir = Utils.createTempDir() - tempDir.delete() - pointsRDD.write.parquet(tempDir.getCanonicalPath) + testStandardAndLegacyModes("UDTs with Parquet") { + withTempPath { dir => + val path = dir.getCanonicalPath + pointsRDD.write.parquet(path) + checkAnswer( + sqlContext.read.parquet(path), + Seq( + Row(1.0, new MyDenseVector(Array(0.1, 1.0))), + Row(0.0, new MyDenseVector(Array(0.2, 2.0))))) + } } - test("Repartition UDTs with Parquet") { - val tempDir = Utils.createTempDir() - tempDir.delete() - pointsRDD.repartition(1).write.parquet(tempDir.getCanonicalPath) + testStandardAndLegacyModes("Repartition UDTs with Parquet") { + withTempPath { dir => + val path = dir.getCanonicalPath + pointsRDD.repartition(1).write.parquet(path) + checkAnswer( + sqlContext.read.parquet(path), + Seq( + Row(1.0, new MyDenseVector(Array(0.1, 1.0))), + Row(0.0, new MyDenseVector(Array(0.2, 2.0))))) + } } // Tests to make sure that all operators correctly convert types on the way out. 
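The UDT round-trips above now run once per writer mode. A minimal standalone sketch of the same check, assuming a SQLContext named `sqlContext`, the `pointsRDD` DataFrame defined in this suite, and the `spark.sql.parquet.writeLegacyFormat` option described earlier:

    // Sketch only: write the UDT data under each writer mode and read it back.
    for (legacy <- Seq("false", "true")) {
      sqlContext.setConf("spark.sql.parquet.writeLegacyFormat", legacy)
      val dir = java.nio.file.Files.createTempDirectory("udt-parquet").toFile
      val path = new java.io.File(dir, s"mode-$legacy").getCanonicalPath
      pointsRDD.write.parquet(path)
      assert(sqlContext.read.parquet(path).count() == pointsRDD.count())
    }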
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetIOSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetIOSuite.scala index cd552e83372f1..599cf948e76a0 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetIOSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetIOSuite.scala @@ -28,10 +28,10 @@ import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.mapreduce.{JobContext, TaskAttemptContext} import org.apache.parquet.example.data.simple.SimpleGroup import org.apache.parquet.example.data.{Group, GroupWriter} +import org.apache.parquet.hadoop._ import org.apache.parquet.hadoop.api.WriteSupport import org.apache.parquet.hadoop.api.WriteSupport.WriteContext -import org.apache.parquet.hadoop.metadata.{BlockMetaData, CompressionCodecName, FileMetaData, ParquetMetadata} -import org.apache.parquet.hadoop.{Footer, ParquetFileWriter, ParquetOutputCommitter, ParquetWriter} +import org.apache.parquet.hadoop.metadata.{CompressionCodecName, FileMetaData, ParquetMetadata} import org.apache.parquet.io.api.RecordConsumer import org.apache.parquet.schema.{MessageType, MessageTypeParser} @@ -99,16 +99,18 @@ class ParquetIOSuite extends QueryTest with ParquetTest with SharedSQLContext { withSQLConf(SQLConf.PARQUET_BINARY_AS_STRING.key -> "true")(checkParquetFile(data)) } - test("fixed-length decimals") { - def makeDecimalRDD(decimal: DecimalType): DataFrame = - sparkContext - .parallelize(0 to 1000) - .map(i => Tuple1(i / 100.0)) - .toDF() - // Parquet doesn't allow column names with spaces, have to add an alias here - .select($"_1" cast decimal as "dec") + testStandardAndLegacyModes("fixed-length decimals") { + def makeDecimalRDD(decimal: DecimalType): DataFrame = { + sqlContext + .range(1000) + // Parquet doesn't allow column names with spaces, have to add an alias here. + // Minus 500 here so that negative decimals are also tested. 
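+          // (sqlContext.range(1000) yields ids 0 to 999, so the values span -5.00 to 4.99 before the cast.)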
+ .select((('id - 500) / 100.0) cast decimal as 'dec) + .coalesce(1) + } - for ((precision, scale) <- Seq((5, 2), (1, 0), (1, 1), (18, 10), (18, 17), (19, 0), (38, 37))) { + val combinations = Seq((5, 2), (1, 0), (1, 1), (18, 10), (18, 17), (19, 0), (38, 37)) + for ((precision, scale) <- combinations) { withTempPath { dir => val data = makeDecimalRDD(DecimalType(precision, scale)) data.write.parquet(dir.getCanonicalPath) @@ -132,22 +134,22 @@ class ParquetIOSuite extends QueryTest with ParquetTest with SharedSQLContext { } } - test("map") { + testStandardAndLegacyModes("map") { val data = (1 to 4).map(i => Tuple1(Map(i -> s"val_$i"))) checkParquetFile(data) } - test("array") { + testStandardAndLegacyModes("array") { val data = (1 to 4).map(i => Tuple1(Seq(i, i + 1))) checkParquetFile(data) } - test("array and double") { + testStandardAndLegacyModes("array and double") { val data = (1 to 4).map(i => (i.toDouble, Seq(i.toDouble, (i + 1).toDouble))) checkParquetFile(data) } - test("struct") { + testStandardAndLegacyModes("struct") { val data = (1 to 4).map(i => Tuple1((i, s"val_$i"))) withParquetDataFrame(data) { df => // Structs are converted to `Row`s @@ -157,7 +159,7 @@ class ParquetIOSuite extends QueryTest with ParquetTest with SharedSQLContext { } } - test("nested struct with array of array as field") { + testStandardAndLegacyModes("nested struct with array of array as field") { val data = (1 to 4).map(i => Tuple1((i, Seq(Seq(s"val_$i"))))) withParquetDataFrame(data) { df => // Structs are converted to `Row`s @@ -167,7 +169,7 @@ class ParquetIOSuite extends QueryTest with ParquetTest with SharedSQLContext { } } - test("nested map with struct as value type") { + testStandardAndLegacyModes("nested map with struct as value type") { val data = (1 to 4).map(i => Tuple1(Map(i -> (i, s"val_$i")))) withParquetDataFrame(data) { df => checkAnswer(df, data.map { case Tuple1(m) => @@ -205,14 +207,14 @@ class ParquetIOSuite extends QueryTest with ParquetTest with SharedSQLContext { } test("compression codec") { - def compressionCodecFor(path: String): String = { - val codecs = ParquetTypesConverter - .readMetaData(new Path(path), Some(hadoopConfiguration)).getBlocks.asScala - .flatMap(_.getColumns.asScala) - .map(_.getCodec.name()) - .distinct - - assert(codecs.size === 1) + def compressionCodecFor(path: String, codecName: String): String = { + val codecs = for { + footer <- readAllFootersWithoutSummaryFiles(new Path(path), hadoopConfiguration) + block <- footer.getParquetMetadata.getBlocks.asScala + column <- block.getColumns.asScala + } yield column.getCodec.name() + + assert(codecs.distinct === Seq(codecName)) codecs.head } @@ -222,7 +224,7 @@ class ParquetIOSuite extends QueryTest with ParquetTest with SharedSQLContext { withSQLConf(SQLConf.PARQUET_COMPRESSION.key -> codec.name()) { withParquetFile(data) { path => assertResult(sqlContext.conf.parquetCompressionCodec.toUpperCase) { - compressionCodecFor(path) + compressionCodecFor(path, codec.name()) } } } @@ -278,15 +280,14 @@ class ParquetIOSuite extends QueryTest with ParquetTest with SharedSQLContext { withTempPath { file => val path = new Path(file.toURI.toString) val fs = FileSystem.getLocal(hadoopConfiguration) - val attributes = ScalaReflection.attributesFor[(Int, String)] - ParquetTypesConverter.writeMetaData(attributes, path, hadoopConfiguration) + val schema = StructType.fromAttributes(ScalaReflection.attributesFor[(Int, String)]) + writeMetadata(schema, path, hadoopConfiguration) assert(fs.exists(new Path(path, 
ParquetFileWriter.PARQUET_COMMON_METADATA_FILE))) assert(fs.exists(new Path(path, ParquetFileWriter.PARQUET_METADATA_FILE))) - val metaData = ParquetTypesConverter.readMetaData(path, Some(hadoopConfiguration)) - val actualSchema = metaData.getFileMetaData.getSchema - val expectedSchema = ParquetTypesConverter.convertFromAttributes(attributes) + val expectedSchema = new CatalystSchemaConverter().convert(schema) + val actualSchema = readFooter(path, hadoopConfiguration).getFileMetaData.getSchema actualSchema.checkContains(expectedSchema) expectedSchema.checkContains(actualSchema) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetQuerySuite.scala index 1c1cfa34ad04b..cc02ef81c9f8b 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetQuerySuite.scala @@ -484,7 +484,7 @@ class ParquetQuerySuite extends QueryTest with ParquetTest with SharedSQLContext } } - test("SPARK-10301 requested schema clipping - UDT") { + testStandardAndLegacyModes("SPARK-10301 requested schema clipping - UDT") { withTempPath { dir => val path = dir.getCanonicalPath @@ -517,6 +517,50 @@ class ParquetQuerySuite extends QueryTest with ParquetTest with SharedSQLContext Row(Row(NestedStruct(1, 2L, 3.5D)))) } } + + test("expand UDT in StructType") { + val schema = new StructType().add("n", new NestedStructUDT, nullable = true) + val expected = new StructType().add("n", new NestedStructUDT().sqlType, nullable = true) + assert(CatalystReadSupport.expandUDT(schema) === expected) + } + + test("expand UDT in ArrayType") { + val schema = new StructType().add( + "n", + ArrayType( + elementType = new NestedStructUDT, + containsNull = false), + nullable = true) + + val expected = new StructType().add( + "n", + ArrayType( + elementType = new NestedStructUDT().sqlType, + containsNull = false), + nullable = true) + + assert(CatalystReadSupport.expandUDT(schema) === expected) + } + + test("expand UDT in MapType") { + val schema = new StructType().add( + "n", + MapType( + keyType = IntegerType, + valueType = new NestedStructUDT, + valueContainsNull = false), + nullable = true) + + val expected = new StructType().add( + "n", + MapType( + keyType = IntegerType, + valueType = new NestedStructUDT().sqlType, + valueContainsNull = false), + nullable = true) + + assert(CatalystReadSupport.expandUDT(schema) === expected) + } } object TestingUDT { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSchemaSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSchemaSuite.scala index f17fb36f25fe8..60fa81b1ab819 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSchemaSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSchemaSuite.scala @@ -357,8 +357,8 @@ class ParquetSchemaSuite extends ParquetSchemaTest { val jsonString = """{"type":"struct","fields":[{"name":"c1","type":"integer","nullable":false,"metadata":{}},{"name":"c2","type":"binary","nullable":true,"metadata":{}}]}""" // scalastyle:on - val fromCaseClassString = ParquetTypesConverter.convertFromString(caseClassString) - val fromJson = ParquetTypesConverter.convertFromString(jsonString) + val fromCaseClassString = 
StructType.fromString(caseClassString) + val fromJson = StructType.fromString(jsonString) (fromCaseClassString, fromJson).zipped.foreach { (a, b) => assert(a.name == b.name) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetTest.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetTest.scala index 442fafb12f200..9840ad919e510 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetTest.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetTest.scala @@ -19,11 +19,19 @@ package org.apache.spark.sql.execution.datasources.parquet import java.io.File +import scala.collection.JavaConverters._ import scala.reflect.ClassTag import scala.reflect.runtime.universe.TypeTag +import org.apache.hadoop.conf.Configuration +import org.apache.hadoop.fs.Path +import org.apache.parquet.format.converter.ParquetMetadataConverter +import org.apache.parquet.hadoop.metadata.{BlockMetaData, FileMetaData, ParquetMetadata} +import org.apache.parquet.hadoop.{Footer, ParquetFileReader, ParquetFileWriter} + import org.apache.spark.sql.test.SQLTestUtils -import org.apache.spark.sql.{DataFrame, SaveMode, SQLContext} +import org.apache.spark.sql.types.StructType +import org.apache.spark.sql.{DataFrame, SQLConf, SaveMode} /** * A helper trait that provides convenient facilities for Parquet testing. @@ -97,4 +105,38 @@ private[sql] trait ParquetTest extends SQLTestUtils { assert(partDir.mkdirs(), s"Couldn't create directory $partDir") partDir } + + protected def writeMetadata( + schema: StructType, path: Path, configuration: Configuration): Unit = { + val parquetSchema = new CatalystSchemaConverter().convert(schema) + val extraMetadata = Map(CatalystReadSupport.SPARK_METADATA_KEY -> schema.json).asJava + val createdBy = s"Apache Spark ${org.apache.spark.SPARK_VERSION}" + val fileMetadata = new FileMetaData(parquetSchema, extraMetadata, createdBy) + val parquetMetadata = new ParquetMetadata(fileMetadata, Seq.empty[BlockMetaData].asJava) + val footer = new Footer(path, parquetMetadata) + ParquetFileWriter.writeMetadataFile(configuration, path, Seq(footer).asJava) + } + + protected def readAllFootersWithoutSummaryFiles( + path: Path, configuration: Configuration): Seq[Footer] = { + val fs = path.getFileSystem(configuration) + ParquetFileReader.readAllFootersInParallel(configuration, fs.getFileStatus(path)).asScala.toSeq + } + + protected def readFooter(path: Path, configuration: Configuration): ParquetMetadata = { + ParquetFileReader.readFooter( + configuration, + new Path(path, ParquetFileWriter.PARQUET_METADATA_FILE), + ParquetMetadataConverter.NO_FILTER) + } + + protected def testStandardAndLegacyModes(testName: String)(f: => Unit): Unit = { + test(s"Standard mode - $testName") { + withSQLConf(SQLConf.PARQUET_WRITE_LEGACY_FORMAT.key -> "false") { f } + } + + test(s"Legacy mode - $testName") { + withSQLConf(SQLConf.PARQUET_WRITE_LEGACY_FORMAT.key -> "true") { f } + } + } } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveMetastoreCatalogSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveMetastoreCatalogSuite.scala index 107457f79ec03..d63f3d3996523 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveMetastoreCatalogSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveMetastoreCatalogSuite.scala @@ -20,11 +20,11 @@ package org.apache.spark.sql.hive import java.io.File import 
org.apache.spark.SparkFunSuite -import org.apache.spark.sql.{QueryTest, Row, SaveMode} import org.apache.spark.sql.hive.client.{ExternalTable, ManagedTable} import org.apache.spark.sql.hive.test.TestHiveSingleton import org.apache.spark.sql.test.{ExamplePointUDT, SQLTestUtils} import org.apache.spark.sql.types.{DecimalType, StringType, StructType} +import org.apache.spark.sql.{SQLConf, QueryTest, Row, SaveMode} class HiveMetastoreCatalogSuite extends SparkFunSuite with TestHiveSingleton { import hiveContext.implicits._ @@ -74,11 +74,13 @@ class DataSourceWithHiveMetastoreCatalogSuite ).foreach { case (provider, (inputFormat, outputFormat, serde)) => test(s"Persist non-partitioned $provider relation into metastore as managed table") { withTable("t") { - testDF - .write - .mode(SaveMode.Overwrite) - .format(provider) - .saveAsTable("t") + withSQLConf(SQLConf.PARQUET_WRITE_LEGACY_FORMAT.key -> "true") { + testDF + .write + .mode(SaveMode.Overwrite) + .format(provider) + .saveAsTable("t") + } val hiveTable = catalog.client.getTable("default", "t") assert(hiveTable.inputFormat === Some(inputFormat)) @@ -102,12 +104,14 @@ class DataSourceWithHiveMetastoreCatalogSuite withTable("t") { val path = dir.getCanonicalFile - testDF - .write - .mode(SaveMode.Overwrite) - .format(provider) - .option("path", path.toString) - .saveAsTable("t") + withSQLConf(SQLConf.PARQUET_WRITE_LEGACY_FORMAT.key -> "true") { + testDF + .write + .mode(SaveMode.Overwrite) + .format(provider) + .option("path", path.toString) + .saveAsTable("t") + } val hiveTable = catalog.client.getTable("default", "t") assert(hiveTable.inputFormat === Some(inputFormat)) From 84ea287178247c163226e835490c9c70b17d8d3b Mon Sep 17 00:00:00 2001 From: Reynold Xin Date: Thu, 8 Oct 2015 17:25:14 -0700 Subject: [PATCH 006/139] [SPARK-10914] UnsafeRow serialization breaks when two machines have different Oops size. UnsafeRow contains 3 pieces of information when pointing to some data in memory (an object, a base offset, and length). When the row is serialized with Java/Kryo serialization, the object layout in memory can change if two machines have different pointer width (Oops in JVM). To reproduce, launch Spark using MASTER=local-cluster[2,1,1024] bin/spark-shell --conf "spark.executor.extraJavaOptions=-XX:-UseCompressedOops" And then run the following scala> sql("select 1 xx").collect() Author: Reynold Xin Closes #9030 from rxin/SPARK-10914. 
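For context, here is a condensed sketch of the regression check this patch adds (see the UnsafeRowSuite changes below); it assumes only a plain SparkConf plus the JavaSerializer/KryoSerializer classes from spark-core, and folds both of the new tests into one loop. A row that points into a larger buffer must round-trip through either serializer and come back holding just its own 16 bytes, independent of the JVM's pointer width:

```scala
import org.apache.spark.SparkConf
import org.apache.spark.serializer.{JavaSerializer, KryoSerializer}
import org.apache.spark.sql.catalyst.expressions.UnsafeRow

// Point a 1-field, 16-byte UnsafeRow at a slice of a much larger buffer.
val data = new Array[Byte](1024)
val row = new UnsafeRow
row.pointTo(data, 1, 16)
row.setLong(0, 19285)

// Both serializers should copy only the row's own bytes, not the whole buffer,
// and deserialization should not depend on the original (object, offset) layout.
for (serializer <- Seq(new JavaSerializer(new SparkConf), new KryoSerializer(new SparkConf))) {
  val instance = serializer.newInstance()
  val copy = instance.deserialize[UnsafeRow](instance.serialize(row))
  assert(copy.getLong(0) == 19285)
  assert(copy.getBaseObject().asInstanceOf[Array[Byte]].length == 16)
}
```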
--- .../sql/catalyst/expressions/UnsafeRow.java | 47 +++++++++++++++++-- .../org/apache/spark/sql/UnsafeRowSuite.scala | 29 +++++++++++- 2 files changed, 72 insertions(+), 4 deletions(-) diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRow.java b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRow.java index e8ac2999c2d29..5af7ed5d6eb6d 100644 --- a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRow.java +++ b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRow.java @@ -17,8 +17,7 @@ package org.apache.spark.sql.catalyst.expressions; -import java.io.IOException; -import java.io.OutputStream; +import java.io.*; import java.math.BigDecimal; import java.math.BigInteger; import java.util.Arrays; @@ -26,6 +25,11 @@ import java.util.HashSet; import java.util.Set; +import com.esotericsoftware.kryo.Kryo; +import com.esotericsoftware.kryo.KryoSerializable; +import com.esotericsoftware.kryo.io.Input; +import com.esotericsoftware.kryo.io.Output; + import org.apache.spark.sql.types.*; import org.apache.spark.unsafe.Platform; import org.apache.spark.unsafe.array.ByteArrayMethods; @@ -35,6 +39,7 @@ import org.apache.spark.unsafe.types.UTF8String; import static org.apache.spark.sql.types.DataTypes.*; +import static org.apache.spark.unsafe.Platform.BYTE_ARRAY_OFFSET; /** * An Unsafe implementation of Row which is backed by raw memory instead of Java objects. @@ -52,7 +57,7 @@ * * Instances of `UnsafeRow` act as pointers to row data stored in this format. */ -public final class UnsafeRow extends MutableRow { +public final class UnsafeRow extends MutableRow implements Externalizable, KryoSerializable { ////////////////////////////////////////////////////////////////////////////// // Static methods @@ -596,4 +601,40 @@ public boolean anyNull() { public void writeToMemory(Object target, long targetOffset) { Platform.copyMemory(baseObject, baseOffset, target, targetOffset, sizeInBytes); } + + @Override + public void writeExternal(ObjectOutput out) throws IOException { + byte[] bytes = getBytes(); + out.writeInt(bytes.length); + out.writeInt(this.numFields); + out.write(bytes); + } + + @Override + public void readExternal(ObjectInput in) throws IOException, ClassNotFoundException { + this.baseOffset = BYTE_ARRAY_OFFSET; + this.sizeInBytes = in.readInt(); + this.numFields = in.readInt(); + this.bitSetWidthInBytes = calculateBitSetWidthInBytes(numFields); + this.baseObject = new byte[sizeInBytes]; + in.readFully((byte[]) baseObject); + } + + @Override + public void write(Kryo kryo, Output out) { + byte[] bytes = getBytes(); + out.writeInt(bytes.length); + out.writeInt(this.numFields); + out.write(bytes); + } + + @Override + public void read(Kryo kryo, Input in) { + this.baseOffset = BYTE_ARRAY_OFFSET; + this.sizeInBytes = in.readInt(); + this.numFields = in.readInt(); + this.bitSetWidthInBytes = calculateBitSetWidthInBytes(numFields); + this.baseObject = new byte[sizeInBytes]; + in.read((byte[]) baseObject); + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/UnsafeRowSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/UnsafeRowSuite.scala index 944d4e11348cf..7d1ee39d4b539 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/UnsafeRowSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/UnsafeRowSuite.scala @@ -19,7 +19,8 @@ package org.apache.spark.sql import java.io.ByteArrayOutputStream -import org.apache.spark.SparkFunSuite +import 
org.apache.spark.{SparkConf, SparkFunSuite} +import org.apache.spark.serializer.{KryoSerializer, JavaSerializer} import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{UnsafeRow, UnsafeProjection} import org.apache.spark.sql.types._ @@ -29,6 +30,32 @@ import org.apache.spark.unsafe.types.UTF8String class UnsafeRowSuite extends SparkFunSuite { + test("UnsafeRow Java serialization") { + // serializing an UnsafeRow pointing to a large buffer should only serialize the relevant data + val data = new Array[Byte](1024) + val row = new UnsafeRow + row.pointTo(data, 1, 16) + row.setLong(0, 19285) + + val ser = new JavaSerializer(new SparkConf).newInstance() + val row1 = ser.deserialize[UnsafeRow](ser.serialize(row)) + assert(row1.getLong(0) == 19285) + assert(row1.getBaseObject().asInstanceOf[Array[Byte]].length == 16) + } + + test("UnsafeRow Kryo serialization") { + // serializing an UnsafeRow pointing to a large buffer should only serialize the relevant data + val data = new Array[Byte](1024) + val row = new UnsafeRow + row.pointTo(data, 1, 16) + row.setLong(0, 19285) + + val ser = new KryoSerializer(new SparkConf).newInstance() + val row1 = ser.deserialize[UnsafeRow](ser.serialize(row)) + assert(row1.getLong(0) == 19285) + assert(row1.getBaseObject().asInstanceOf[Array[Byte]].length == 16) + } + test("bitset width calculation") { assert(UnsafeRow.calculateBitSetWidthInBytes(0) === 0) assert(UnsafeRow.calculateBitSetWidthInBytes(1) === 8) From 3390b400d04e40f767d8a51f1078fcccb4e64abd Mon Sep 17 00:00:00 2001 From: Davies Liu Date: Thu, 8 Oct 2015 17:34:24 -0700 Subject: [PATCH 007/139] [SPARK-10810] [SPARK-10902] [SQL] Improve session management in SQL This PR improves session management by replacing the thread-local approach with one SQLContext per session, introducing separate temporary tables and UDFs/UDAFs for each session. A new SQLContext session can be created by: 1) creating a new SQLContext 2) calling newSession() on an existing SQLContext For HiveContext, in order to reduce the per-session cost, the classloader and Hive client are shared across multiple sessions (created by newSession). The CacheManager is also shared by multiple sessions, so caching a table in different sessions will not create multiple copies of the in-memory cache. Added jars are still shared by all sessions, because SparkContext does not support sessions. cc marmbrus yhuai rxin Author: Davies Liu Closes #8909 from davies/sessions.
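As a usage sketch only (it assumes an existing SparkContext named `sc` and mirrors the SQLContextSuite test added below), the new session API keeps configuration, temporary tables and UDFs isolated per session while sharing the SparkContext and CacheManager:

```scala
import org.apache.spark.sql.{SQLConf, SQLContext}

val sqlContext = SQLContext.getOrCreate(sc)   // the shared, globally registered context
val session1 = sqlContext.newSession()
val session2 = sqlContext.newSession()

// SQL configurations are per session.
session1.setConf(SQLConf.SHUFFLE_PARTITIONS.key, "1")
session2.setConf(SQLConf.SHUFFLE_PARTITIONS.key, "2")

// Temporary tables are not shared between sessions.
session1.range(10).registerTempTable("test1")
assert(session1.tableNames().contains("test1"))
assert(!session2.tableNames().contains("test1"))

// A thread can pin its own session, which getOrCreate will then return for that thread.
SQLContext.setActive(session1)
assert(SQLContext.getOrCreate(sc).eq(session1))
SQLContext.clearActive()
```

Cached data, on the other hand, remains visible to every session created from the same context, since the CacheManager is shared.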
--- project/MimaExcludes.scala | 22 ++- .../catalyst/analysis/FunctionRegistry.scala | 28 ++- .../org/apache/spark/sql/SQLContext.scala | 164 ++++++++++-------- .../spark/sql/execution/CacheManager.scala | 14 +- .../apache/spark/sql/SQLContextSuite.scala | 59 ++++--- .../spark/sql/test/TestSQLContext.scala | 21 +-- .../SparkExecuteStatementOperation.scala | 76 ++------ .../thriftserver/SparkSQLSessionManager.scala | 9 +- .../server/SparkSQLOperationManager.scala | 5 +- .../sql/hive/thriftserver/CliSuite.scala | 8 +- .../HiveThriftServer2Suites.scala | 76 ++++---- .../apache/spark/sql/hive/HiveContext.scala | 155 +++++++++++------ .../org/apache/spark/sql/hive/HiveQl.scala | 28 +-- .../sql/hive/client/ClientInterface.scala | 9 + .../spark/sql/hive/client/ClientWrapper.scala | 85 +++++---- .../hive/client/IsolatedClientLoader.scala | 107 +++++++----- .../spark/sql/hive/execution/commands.scala | 27 +-- .../apache/spark/sql/hive/test/TestHive.scala | 27 +-- .../apache/spark/sql/hive/HiveQlSuite.scala | 13 +- .../spark/sql/hive/client/VersionsSuite.scala | 6 +- .../sql/hive/execution/HiveQuerySuite.scala | 32 ++++ .../sql/hive/execution/SQLQuerySuite.scala | 9 +- 22 files changed, 540 insertions(+), 440 deletions(-) diff --git a/project/MimaExcludes.scala b/project/MimaExcludes.scala index 2d4d146f51339..08e4a449cf762 100644 --- a/project/MimaExcludes.scala +++ b/project/MimaExcludes.scala @@ -79,7 +79,27 @@ object MimaExcludes { ProblemFilters.exclude[MissingMethodProblem]( "org.apache.spark.ml.regression.LeastSquaresAggregator.add"), ProblemFilters.exclude[MissingMethodProblem]( - "org.apache.spark.ml.regression.LeastSquaresCostFun.this") + "org.apache.spark.ml.regression.LeastSquaresCostFun.this"), + ProblemFilters.exclude[MissingMethodProblem]( + "org.apache.spark.sql.SQLContext.clearLastInstantiatedContext"), + ProblemFilters.exclude[MissingMethodProblem]( + "org.apache.spark.sql.SQLContext.setLastInstantiatedContext"), + ProblemFilters.exclude[MissingClassProblem]( + "org.apache.spark.sql.SQLContext$SQLSession"), + ProblemFilters.exclude[MissingMethodProblem]( + "org.apache.spark.sql.SQLContext.detachSession"), + ProblemFilters.exclude[MissingMethodProblem]( + "org.apache.spark.sql.SQLContext.tlSession"), + ProblemFilters.exclude[MissingMethodProblem]( + "org.apache.spark.sql.SQLContext.defaultSession"), + ProblemFilters.exclude[MissingMethodProblem]( + "org.apache.spark.sql.SQLContext.currentSession"), + ProblemFilters.exclude[MissingMethodProblem]( + "org.apache.spark.sql.SQLContext.openSession"), + ProblemFilters.exclude[MissingMethodProblem]( + "org.apache.spark.sql.SQLContext.setSession"), + ProblemFilters.exclude[MissingMethodProblem]( + "org.apache.spark.sql.SQLContext.createSession") ) case v if v.startsWith("1.5") => Seq( diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala index e6122d92b763c..ba77b70a378a6 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala @@ -51,23 +51,37 @@ class SimpleFunctionRegistry extends FunctionRegistry { private val functionBuilders = StringKeyHashMap[(ExpressionInfo, FunctionBuilder)](caseSensitive = false) - override def registerFunction(name: String, info: ExpressionInfo, builder: FunctionBuilder) - : Unit = { + override def registerFunction( + name: String, + 
info: ExpressionInfo, + builder: FunctionBuilder): Unit = synchronized { functionBuilders.put(name, (info, builder)) } override def lookupFunction(name: String, children: Seq[Expression]): Expression = { - val func = functionBuilders.get(name).map(_._2).getOrElse { - throw new AnalysisException(s"undefined function $name") + val func = synchronized { + functionBuilders.get(name).map(_._2).getOrElse { + throw new AnalysisException(s"undefined function $name") + } } func(children) } - override def listFunction(): Seq[String] = functionBuilders.iterator.map(_._1).toList.sorted + override def listFunction(): Seq[String] = synchronized { + functionBuilders.iterator.map(_._1).toList.sorted + } - override def lookupFunction(name: String): Option[ExpressionInfo] = { + override def lookupFunction(name: String): Option[ExpressionInfo] = synchronized { functionBuilders.get(name).map(_._1) } + + def copy(): SimpleFunctionRegistry = synchronized { + val registry = new SimpleFunctionRegistry + functionBuilders.iterator.foreach { case (name, (info, builder)) => + registry.registerFunction(name, info, builder) + } + registry + } } /** @@ -257,7 +271,7 @@ object FunctionRegistry { expression[InputFileName]("input_file_name") ) - val builtin: FunctionRegistry = { + val builtin: SimpleFunctionRegistry = { val fr = new SimpleFunctionRegistry expressions.foreach { case (name, (info, builder)) => fr.registerFunction(name, info, builder) } fr diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala index cb0a3e361c97a..2bdfd82af0adb 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala @@ -30,6 +30,7 @@ import org.apache.spark.SparkContext import org.apache.spark.annotation.{DeveloperApi, Experimental} import org.apache.spark.api.java.{JavaRDD, JavaSparkContext} import org.apache.spark.rdd.RDD +import org.apache.spark.scheduler.{SparkListener, SparkListenerApplicationEnd} import org.apache.spark.sql.SQLConf.SQLConfEntry import org.apache.spark.sql.catalyst.analysis._ import org.apache.spark.sql.catalyst.errors.DialectException @@ -38,15 +39,12 @@ import org.apache.spark.sql.catalyst.optimizer.{DefaultOptimizer, Optimizer} import org.apache.spark.sql.catalyst.plans.logical.{LocalRelation, LogicalPlan} import org.apache.spark.sql.catalyst.rules.RuleExecutor import org.apache.spark.sql.catalyst.{InternalRow, ParserDialect, _} -import org.apache.spark.sql.execution.{Filter, _} -import org.apache.spark.sql.{execution => sparkexecution} -import org.apache.spark.sql.execution._ -import org.apache.spark.sql.sources._ import org.apache.spark.sql.execution._ import org.apache.spark.sql.execution.datasources._ import org.apache.spark.sql.execution.ui.{SQLListener, SQLTab} import org.apache.spark.sql.sources.BaseRelation import org.apache.spark.sql.types._ +import org.apache.spark.sql.{execution => sparkexecution} import org.apache.spark.util.Utils /** @@ -64,18 +62,30 @@ import org.apache.spark.util.Utils * * @since 1.0.0 */ -class SQLContext(@transient val sparkContext: SparkContext) - extends org.apache.spark.Logging - with Serializable { +class SQLContext private[sql]( + @transient val sparkContext: SparkContext, + @transient protected[sql] val cacheManager: CacheManager) + extends org.apache.spark.Logging with Serializable { self => + def this(sparkContext: SparkContext) = this(sparkContext, new CacheManager) def this(sparkContext: JavaSparkContext) = 
this(sparkContext.sc) + /** + * Returns a SQLContext as new session, with separated SQL configurations, temporary tables, + * registered functions, but sharing the same SparkContext and CacheManager. + * + * @since 1.6.0 + */ + def newSession(): SQLContext = { + new SQLContext(sparkContext, cacheManager) + } + /** * @return Spark SQL configuration */ - protected[sql] def conf = currentSession().conf + protected[sql] lazy val conf = new SQLConf // `listener` should be only used in the driver @transient private[sql] val listener = new SQLListener(this) @@ -142,13 +152,11 @@ class SQLContext(@transient val sparkContext: SparkContext) */ def getAllConfs: immutable.Map[String, String] = conf.getAllConfs - // TODO how to handle the temp table per user session? @transient protected[sql] lazy val catalog: Catalog = new SimpleCatalog(conf) - // TODO how to handle the temp function per user session? @transient - protected[sql] lazy val functionRegistry: FunctionRegistry = FunctionRegistry.builtin + protected[sql] lazy val functionRegistry: FunctionRegistry = FunctionRegistry.builtin.copy() @transient protected[sql] lazy val analyzer: Analyzer = @@ -198,20 +206,19 @@ class SQLContext(@transient val sparkContext: SparkContext) protected[sql] def executePlan(plan: LogicalPlan) = new sparkexecution.QueryExecution(this, plan) - @transient - protected[sql] val tlSession = new ThreadLocal[SQLSession]() { - override def initialValue: SQLSession = defaultSession - } - - @transient - protected[sql] val defaultSession = createSession() - protected[sql] def dialectClassName = if (conf.dialect == "sql") { classOf[DefaultParserDialect].getCanonicalName } else { conf.dialect } + /** + * Add a jar to SQLContext + */ + protected[sql] def addJar(path: String): Unit = { + sparkContext.addJar(path) + } + { // We extract spark sql settings from SparkContext's conf and put them to // Spark SQL's conf. @@ -236,9 +243,6 @@ class SQLContext(@transient val sparkContext: SparkContext) } } - @transient - protected[sql] val cacheManager = new CacheManager(this) - /** * :: Experimental :: * A collection of methods that are considered experimental, but can be used to hook into @@ -300,21 +304,25 @@ class SQLContext(@transient val sparkContext: SparkContext) * @group cachemgmt * @since 1.3.0 */ - def isCached(tableName: String): Boolean = cacheManager.isCached(tableName) + def isCached(tableName: String): Boolean = { + cacheManager.lookupCachedData(table(tableName)).nonEmpty + } /** * Caches the specified table in-memory. * @group cachemgmt * @since 1.3.0 */ - def cacheTable(tableName: String): Unit = cacheManager.cacheTable(tableName) + def cacheTable(tableName: String): Unit = { + cacheManager.cacheQuery(table(tableName), Some(tableName)) + } /** * Removes the specified table from the in-memory cache. * @group cachemgmt * @since 1.3.0 */ - def uncacheTable(tableName: String): Unit = cacheManager.uncacheTable(tableName) + def uncacheTable(tableName: String): Unit = cacheManager.uncacheQuery(table(tableName)) /** * Removes all cached tables from the in-memory cache. 
@@ -830,36 +838,6 @@ class SQLContext(@transient val sparkContext: SparkContext) ) } - protected[sql] def openSession(): SQLSession = { - detachSession() - val session = createSession() - tlSession.set(session) - - session - } - - protected[sql] def currentSession(): SQLSession = { - tlSession.get() - } - - protected[sql] def createSession(): SQLSession = { - new this.SQLSession() - } - - protected[sql] def detachSession(): Unit = { - tlSession.remove() - } - - protected[sql] def setSession(session: SQLSession): Unit = { - detachSession() - tlSession.set(session) - } - - protected[sql] class SQLSession { - // Note that this is a lazy val so we can override the default value in subclasses. - protected[sql] lazy val conf: SQLConf = new SQLConf - } - @deprecated("use org.apache.spark.sql.QueryExecution", "1.6.0") protected[sql] class QueryExecution(logical: LogicalPlan) extends sparkexecution.QueryExecution(this, logical) @@ -1196,46 +1174,90 @@ class SQLContext(@transient val sparkContext: SparkContext) // Register a succesfully instantiatd context to the singleton. This should be at the end of // the class definition so that the singleton is updated only if there is no exception in the // construction of the instance. - SQLContext.setLastInstantiatedContext(self) + sparkContext.addSparkListener(new SparkListener { + override def onApplicationEnd(applicationEnd: SparkListenerApplicationEnd): Unit = { + SQLContext.clearInstantiatedContext(self) + } + }) + + SQLContext.setInstantiatedContext(self) } /** * This SQLContext object contains utility functions to create a singleton SQLContext instance, - * or to get the last created SQLContext instance. + * or to get the created SQLContext instance. + * + * It also provides utility functions to support preference for threads in multiple sessions + * scenario, setActive could set a SQLContext for current thread, which will be returned by + * getOrCreate instead of the global one. */ object SQLContext { - private val INSTANTIATION_LOCK = new Object() + /** + * The active SQLContext for the current thread. + */ + private val activeContext: InheritableThreadLocal[SQLContext] = + new InheritableThreadLocal[SQLContext] /** - * Reference to the last created SQLContext. + * Reference to the created SQLContext. */ - @transient private val lastInstantiatedContext = new AtomicReference[SQLContext]() + @transient private val instantiatedContext = new AtomicReference[SQLContext]() /** * Get the singleton SQLContext if it exists or create a new one using the given SparkContext. + * * This function can be used to create a singleton SQLContext object that can be shared across * the JVM. + * + * If there is an active SQLContext for current thread, it will be returned instead of the global + * one. 
+ * + * @since 1.5.0 */ def getOrCreate(sparkContext: SparkContext): SQLContext = { - INSTANTIATION_LOCK.synchronized { - if (lastInstantiatedContext.get() == null) { + val ctx = activeContext.get() + if (ctx != null) { + return ctx + } + + synchronized { + val ctx = instantiatedContext.get() + if (ctx == null) { new SQLContext(sparkContext) + } else { + ctx } } - lastInstantiatedContext.get() } - private[sql] def clearLastInstantiatedContext(): Unit = { - INSTANTIATION_LOCK.synchronized { - lastInstantiatedContext.set(null) - } + private[sql] def clearInstantiatedContext(sqlContext: SQLContext): Unit = { + instantiatedContext.compareAndSet(sqlContext, null) } - private[sql] def setLastInstantiatedContext(sqlContext: SQLContext): Unit = { - INSTANTIATION_LOCK.synchronized { - lastInstantiatedContext.set(sqlContext) - } + private[sql] def setInstantiatedContext(sqlContext: SQLContext): Unit = { + instantiatedContext.compareAndSet(null, sqlContext) + } + + /** + * Changes the SQLContext that will be returned in this thread and its children when + * SQLContext.getOrCreate() is called. This can be used to ensure that a given thread receives + * a SQLContext with an isolated session, instead of the global (first created) context. + * + * @since 1.6.0 + */ + def setActive(sqlContext: SQLContext): Unit = { + activeContext.set(sqlContext) + } + + /** + * Clears the active SQLContext for current thread. Subsequent calls to getOrCreate will + * return the first created context instead of a thread-local override. + * + * @since 1.6.0 + */ + def clearActive(): Unit = { + activeContext.remove() } /** diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/CacheManager.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/CacheManager.scala index d3e5c378d037d..f85aeb1b02694 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/CacheManager.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/CacheManager.scala @@ -20,9 +20,9 @@ package org.apache.spark.sql.execution import java.util.concurrent.locks.ReentrantReadWriteLock import org.apache.spark.Logging +import org.apache.spark.sql.DataFrame import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.columnar.InMemoryRelation -import org.apache.spark.sql.{DataFrame, SQLContext} import org.apache.spark.storage.StorageLevel import org.apache.spark.storage.StorageLevel.MEMORY_AND_DISK @@ -37,7 +37,7 @@ private[sql] case class CachedData(plan: LogicalPlan, cachedRepresentation: InMe * * Internal to Spark SQL. */ -private[sql] class CacheManager(sqlContext: SQLContext) extends Logging { +private[sql] class CacheManager extends Logging { @transient private val cachedData = new scala.collection.mutable.ArrayBuffer[CachedData] @@ -45,15 +45,6 @@ private[sql] class CacheManager(sqlContext: SQLContext) extends Logging { @transient private val cacheLock = new ReentrantReadWriteLock - /** Returns true if the table is currently cached in-memory. */ - def isCached(tableName: String): Boolean = lookupCachedData(sqlContext.table(tableName)).nonEmpty - - /** Caches the specified table in-memory. */ - def cacheTable(tableName: String): Unit = cacheQuery(sqlContext.table(tableName), Some(tableName)) - - /** Removes the specified table from the in-memory cache. */ - def uncacheTable(tableName: String): Unit = uncacheQuery(sqlContext.table(tableName)) - /** Acquires a read lock on the cache for the duration of `f`. 
*/ private def readLock[A](f: => A): A = { val lock = cacheLock.readLock() @@ -96,6 +87,7 @@ private[sql] class CacheManager(sqlContext: SQLContext) extends Logging { if (lookupCachedData(planToCache).nonEmpty) { logWarning("Asked to cache already cached data.") } else { + val sqlContext = query.sqlContext cachedData += CachedData( planToCache, diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLContextSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLContextSuite.scala index dd88ae3700ab9..1994dacfc4dfa 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SQLContextSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLContextSuite.scala @@ -17,33 +17,52 @@ package org.apache.spark.sql -import org.apache.spark.SparkFunSuite -import org.apache.spark.sql.test.SharedSQLContext +import org.apache.spark.{SharedSparkContext, SparkFunSuite} -class SQLContextSuite extends SparkFunSuite with SharedSQLContext { - - override def afterAll(): Unit = { - try { - SQLContext.setLastInstantiatedContext(sqlContext) - } finally { - super.afterAll() - } - } +class SQLContextSuite extends SparkFunSuite with SharedSparkContext{ test("getOrCreate instantiates SQLContext") { - SQLContext.clearLastInstantiatedContext() - val sqlContext = SQLContext.getOrCreate(sparkContext) + val sqlContext = SQLContext.getOrCreate(sc) assert(sqlContext != null, "SQLContext.getOrCreate returned null") - assert(SQLContext.getOrCreate(sparkContext).eq(sqlContext), + assert(SQLContext.getOrCreate(sc).eq(sqlContext), "SQLContext created by SQLContext.getOrCreate not returned by SQLContext.getOrCreate") } - test("getOrCreate gets last explicitly instantiated SQLContext") { - SQLContext.clearLastInstantiatedContext() - val sqlContext = new SQLContext(sparkContext) - assert(SQLContext.getOrCreate(sparkContext) != null, - "SQLContext.getOrCreate after explicitly created SQLContext returned null") - assert(SQLContext.getOrCreate(sparkContext).eq(sqlContext), + test("getOrCreate return the original SQLContext") { + val sqlContext = SQLContext.getOrCreate(sc) + val newSession = sqlContext.newSession() + assert(SQLContext.getOrCreate(sc).eq(sqlContext), "SQLContext.getOrCreate after explicitly created SQLContext did not return the context") + SQLContext.setActive(newSession) + assert(SQLContext.getOrCreate(sc).eq(newSession), + "SQLContext.getOrCreate after explicitly setActive() did not return the active context") + } + + test("Sessions of SQLContext") { + val sqlContext = SQLContext.getOrCreate(sc) + val session1 = sqlContext.newSession() + val session2 = sqlContext.newSession() + + // all have the default configurations + val key = SQLConf.SHUFFLE_PARTITIONS.key + assert(session1.getConf(key) === session2.getConf(key)) + session1.setConf(key, "1") + session2.setConf(key, "2") + assert(session1.getConf(key) === "1") + assert(session2.getConf(key) === "2") + + // temporary table should not be shared + val df = session1.range(10) + df.registerTempTable("test1") + assert(session1.tableNames().contains("test1")) + assert(!session2.tableNames().contains("test1")) + + // UDF should not be shared + def myadd(a: Int, b: Int): Int = a + b + session1.udf.register[Int, Int, Int]("myadd", myadd) + session1.sql("select myadd(1, 2)").explain() + intercept[AnalysisException] { + session2.sql("select myadd(1, 2)").explain() + } } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/test/TestSQLContext.scala b/sql/core/src/test/scala/org/apache/spark/sql/test/TestSQLContext.scala index 
10e633f3cde46..c89a1516503e0 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/test/TestSQLContext.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/test/TestSQLContext.scala @@ -31,23 +31,16 @@ private[sql] class TestSQLContext(sc: SparkContext) extends SQLContext(sc) { sel new SparkConf().set("spark.sql.testkey", "true"))) } - // Make sure we set those test specific confs correctly when we create - // the SQLConf as well as when we call clear. - protected[sql] override def createSession(): SQLSession = new this.SQLSession() + protected[sql] override lazy val conf: SQLConf = new SQLConf { - /** A special [[SQLSession]] that uses fewer shuffle partitions than normal. */ - protected[sql] class SQLSession extends super.SQLSession { - protected[sql] override lazy val conf: SQLConf = new SQLConf { + clear() - clear() + override def clear(): Unit = { + super.clear() - override def clear(): Unit = { - super.clear() - - // Make sure we start with the default test configs even after clear - TestSQLContext.overrideConfs.map { - case (key, value) => setConfString(key, value) - } + // Make sure we start with the default test configs even after clear + TestSQLContext.overrideConfs.map { + case (key, value) => setConfString(key, value) } } } diff --git a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkExecuteStatementOperation.scala b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkExecuteStatementOperation.scala index 306f98bcb5344..719b03e1c7c71 100644 --- a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkExecuteStatementOperation.scala +++ b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkExecuteStatementOperation.scala @@ -20,19 +20,15 @@ package org.apache.spark.sql.hive.thriftserver import java.security.PrivilegedExceptionAction import java.sql.{Date, Timestamp} import java.util.concurrent.RejectedExecutionException -import java.util.{Arrays, Map => JMap, UUID} +import java.util.{Arrays, UUID, Map => JMap} import scala.collection.JavaConverters._ import scala.collection.mutable.{ArrayBuffer, Map => SMap} import scala.util.control.NonFatal -import org.apache.hadoop.hive.conf.HiveConf import org.apache.hadoop.hive.metastore.api.FieldSchema -import org.apache.hive.service.cli._ -import org.apache.hadoop.hive.ql.metadata.Hive -import org.apache.hadoop.hive.ql.metadata.HiveException -import org.apache.hadoop.hive.ql.session.SessionState import org.apache.hadoop.hive.shims.Utils +import org.apache.hive.service.cli._ import org.apache.hive.service.cli.operation.ExecuteStatementOperation import org.apache.hive.service.cli.session.HiveSession @@ -40,7 +36,7 @@ import org.apache.spark.Logging import org.apache.spark.sql.execution.SetCommand import org.apache.spark.sql.hive.{HiveContext, HiveMetastoreTypes} import org.apache.spark.sql.types._ -import org.apache.spark.sql.{DataFrame, Row => SparkRow, SQLConf} +import org.apache.spark.sql.{DataFrame, SQLConf, Row => SparkRow} private[hive] class SparkExecuteStatementOperation( @@ -143,30 +139,15 @@ private[hive] class SparkExecuteStatementOperation( if (!runInBackground) { runInternal() } else { - val parentSessionState = SessionState.get() - val hiveConf = getConfigForOperation() val sparkServiceUGI = Utils.getUGI() - val sessionHive = getCurrentHive() - val currentSqlSession = hiveContext.currentSession // Runnable impl to call runInternal asynchronously, // from a different thread val backgroundOperation = new 
Runnable() { override def run(): Unit = { - val doAsAction = new PrivilegedExceptionAction[Object]() { - override def run(): Object = { - - // User information is part of the metastore client member in Hive - hiveContext.setSession(currentSqlSession) - // Always use the latest class loader provided by executionHive's state. - val executionHiveClassLoader = - hiveContext.executionHive.state.getConf.getClassLoader - sessionHive.getConf.setClassLoader(executionHiveClassLoader) - parentSessionState.getConf.setClassLoader(executionHiveClassLoader) - - Hive.set(sessionHive) - SessionState.setCurrentSessionState(parentSessionState) + val doAsAction = new PrivilegedExceptionAction[Unit]() { + override def run(): Unit = { try { runInternal() } catch { @@ -174,7 +155,6 @@ private[hive] class SparkExecuteStatementOperation( setOperationException(e) log.error("Error running hive query: ", e) } - return null } } @@ -191,7 +171,7 @@ private[hive] class SparkExecuteStatementOperation( try { // This submit blocks if no background threads are available to run this operation val backgroundHandle = - getParentSession().getSessionManager().submitBackgroundOperation(backgroundOperation) + parentSession.getSessionManager().submitBackgroundOperation(backgroundOperation) setBackgroundHandle(backgroundHandle) } catch { case rejected: RejectedExecutionException => @@ -210,6 +190,11 @@ private[hive] class SparkExecuteStatementOperation( statementId = UUID.randomUUID().toString logInfo(s"Running query '$statement' with $statementId") setState(OperationState.RUNNING) + // Always use the latest class loader provided by executionHive's state. + val executionHiveClassLoader = + hiveContext.executionHive.state.getConf.getClassLoader + Thread.currentThread().setContextClassLoader(executionHiveClassLoader) + HiveThriftServer2.listener.onStatementStart( statementId, parentSession.getSessionHandle.getSessionId.toString, @@ -279,43 +264,4 @@ private[hive] class SparkExecuteStatementOperation( } } } - - /** - * If there are query specific settings to overlay, then create a copy of config - * There are two cases we need to clone the session config that's being passed to hive driver - * 1. Async query - - * If the client changes a config setting, that shouldn't reflect in the execution - * already underway - * 2. 
confOverlay - - * The query specific settings should only be applied to the query config and not session - * @return new configuration - * @throws HiveSQLException - */ - private def getConfigForOperation(): HiveConf = { - var sqlOperationConf = getParentSession().getHiveConf() - if (!getConfOverlay().isEmpty() || runInBackground) { - // clone the partent session config for this query - sqlOperationConf = new HiveConf(sqlOperationConf) - - // apply overlay query specific settings, if any - getConfOverlay().asScala.foreach { case (k, v) => - try { - sqlOperationConf.verifyAndSet(k, v) - } catch { - case e: IllegalArgumentException => - throw new HiveSQLException("Error applying statement specific settings", e) - } - } - } - return sqlOperationConf - } - - private def getCurrentHive(): Hive = { - try { - return Hive.get() - } catch { - case e: HiveException => - throw new HiveSQLException("Failed to get current Hive object", e); - } - } } diff --git a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLSessionManager.scala b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLSessionManager.scala index 92ac0ec3fca29..33aaead3fbf96 100644 --- a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLSessionManager.scala +++ b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/SparkSQLSessionManager.scala @@ -36,7 +36,7 @@ private[hive] class SparkSQLSessionManager(hiveServer: HiveServer2, hiveContext: extends SessionManager(hiveServer) with ReflectedCompositeService { - private lazy val sparkSqlOperationManager = new SparkSQLOperationManager(hiveContext) + private lazy val sparkSqlOperationManager = new SparkSQLOperationManager() override def init(hiveConf: HiveConf) { setSuperField(this, "hiveConf", hiveConf) @@ -60,13 +60,15 @@ private[hive] class SparkSQLSessionManager(hiveServer: HiveServer2, hiveContext: sessionConf: java.util.Map[String, String], withImpersonation: Boolean, delegationToken: String): SessionHandle = { - hiveContext.openSession() val sessionHandle = super.openSession(protocol, username, passwd, ipAddress, sessionConf, withImpersonation, delegationToken) val session = super.getSession(sessionHandle) HiveThriftServer2.listener.onSessionCreated( session.getIpAddress, sessionHandle.getSessionId.toString, session.getUsername) + val ctx = hiveContext.newSession() + ctx.setConf("spark.sql.hive.version", HiveContext.hiveExecutionVersion) + sparkSqlOperationManager.sessionToContexts += sessionHandle -> ctx sessionHandle } @@ -74,7 +76,6 @@ private[hive] class SparkSQLSessionManager(hiveServer: HiveServer2, hiveContext: HiveThriftServer2.listener.onSessionClosed(sessionHandle.getSessionId.toString) super.closeSession(sessionHandle) sparkSqlOperationManager.sessionToActivePool -= sessionHandle - - hiveContext.detachSession() + sparkSqlOperationManager.sessionToContexts.remove(sessionHandle) } } diff --git a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/server/SparkSQLOperationManager.scala b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/server/SparkSQLOperationManager.scala index c8031ed0f3437..476651a559d2c 100644 --- a/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/server/SparkSQLOperationManager.scala +++ b/sql/hive-thriftserver/src/main/scala/org/apache/spark/sql/hive/thriftserver/server/SparkSQLOperationManager.scala @@ -30,20 +30,21 @@ import 
org.apache.spark.sql.hive.thriftserver.{SparkExecuteStatementOperation, R /** * Executes queries using Spark SQL, and maintains a list of handles to active queries. */ -private[thriftserver] class SparkSQLOperationManager(hiveContext: HiveContext) +private[thriftserver] class SparkSQLOperationManager() extends OperationManager with Logging { val handleToOperation = ReflectionUtils .getSuperField[JMap[OperationHandle, Operation]](this, "handleToOperation") val sessionToActivePool = Map[SessionHandle, String]() + val sessionToContexts = Map[SessionHandle, HiveContext]() override def newExecuteStatementOperation( parentSession: HiveSession, statement: String, confOverlay: JMap[String, String], async: Boolean): ExecuteStatementOperation = synchronized { - + val hiveContext = sessionToContexts(parentSession.getSessionHandle) val runInBackground = async && hiveContext.hiveThriftServerAsync val operation = new SparkExecuteStatementOperation(parentSession, statement, confOverlay, runInBackground)(hiveContext, sessionToActivePool) diff --git a/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/CliSuite.scala b/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/CliSuite.scala index e59a14ec00d5c..76d1591a235c2 100644 --- a/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/CliSuite.scala +++ b/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/CliSuite.scala @@ -96,7 +96,7 @@ class CliSuite extends SparkFunSuite with BeforeAndAfter with Logging { buffer += s"${new Timestamp(new Date().getTime)} - $source> $line" // If we haven't found all expected answers and another expected answer comes up... - if (next < expectedAnswers.size && line.startsWith(expectedAnswers(next))) { + if (next < expectedAnswers.size && line.contains(expectedAnswers(next))) { next += 1 // If all expected answers have been found... 
if (next == expectedAnswers.size) { @@ -159,7 +159,7 @@ class CliSuite extends SparkFunSuite with BeforeAndAfter with Logging { s"LOAD DATA LOCAL INPATH '$dataFilePath' OVERWRITE INTO TABLE hive_test;" -> "OK", "CACHE TABLE hive_test;" - -> "Time taken: ", + -> "", "SELECT COUNT(*) FROM hive_test;" -> "5", "DROP TABLE hive_test;" @@ -180,7 +180,7 @@ class CliSuite extends SparkFunSuite with BeforeAndAfter with Logging { "CREATE TABLE hive_test(key INT, val STRING);" -> "OK", "SHOW TABLES;" - -> "Time taken: " + -> "hive_test" ) runCliWithin(2.minute, Seq("--database", "hive_test_db", "-e", "SHOW TABLES;"))( @@ -210,7 +210,7 @@ class CliSuite extends SparkFunSuite with BeforeAndAfter with Logging { s"LOAD DATA LOCAL INPATH '$dataFilePath' OVERWRITE INTO TABLE sourceTable;" -> "OK", "INSERT INTO TABLE t1 SELECT key, val FROM sourceTable;" - -> "Time taken:", + -> "", "SELECT count(key) FROM t1;" -> "5", "DROP TABLE t1;" diff --git a/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/HiveThriftServer2Suites.scala b/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/HiveThriftServer2Suites.scala index 19b2f24456ab0..ff8ca0150649d 100644 --- a/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/HiveThriftServer2Suites.scala +++ b/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/HiveThriftServer2Suites.scala @@ -205,6 +205,7 @@ class HiveThriftBinaryServerSuite extends HiveThriftJdbcTest { import org.apache.spark.sql.SQLConf var defaultV1: String = null var defaultV2: String = null + var data: ArrayBuffer[Int] = null withMultipleConnectionJdbcStatement( // create table @@ -214,10 +215,16 @@ class HiveThriftBinaryServerSuite extends HiveThriftJdbcTest { "DROP TABLE IF EXISTS test_map", "CREATE TABLE test_map(key INT, value STRING)", s"LOAD DATA LOCAL INPATH '${TestData.smallKv}' OVERWRITE INTO TABLE test_map", - "CACHE TABLE test_table AS SELECT key FROM test_map ORDER BY key DESC") + "CACHE TABLE test_table AS SELECT key FROM test_map ORDER BY key DESC", + "CREATE DATABASE db1") queries.foreach(statement.execute) + val plan = statement.executeQuery("explain select * from test_table") + plan.next() + plan.next() + assert(plan.getString(1).contains("InMemoryColumnarTableScan")) + val rs1 = statement.executeQuery("SELECT key FROM test_table ORDER BY KEY DESC") val buf1 = new collection.mutable.ArrayBuffer[Int]() while (rs1.next()) { @@ -233,6 +240,8 @@ class HiveThriftBinaryServerSuite extends HiveThriftJdbcTest { rs2.close() assert(buf1 === buf2) + + data = buf1 }, // first session, we get the default value of the session status @@ -289,56 +298,51 @@ class HiveThriftBinaryServerSuite extends HiveThriftJdbcTest { rs2.close() }, - // accessing the cached data in another session + // try to access the cached data in another session { statement => - val rs1 = statement.executeQuery("SELECT key FROM test_table ORDER BY KEY DESC") - val buf1 = new collection.mutable.ArrayBuffer[Int]() - while (rs1.next()) { - buf1 += rs1.getInt(1) + // Cached temporary table can't be accessed by other sessions + intercept[SQLException] { + statement.executeQuery("SELECT key FROM test_table ORDER BY KEY DESC") } - rs1.close() - val rs2 = statement.executeQuery("SELECT key FROM test_map ORDER BY KEY DESC") - val buf2 = new collection.mutable.ArrayBuffer[Int]() - while (rs2.next()) { - buf2 += rs2.getInt(1) + val plan = statement.executeQuery("explain select key from test_map ORDER BY key DESC") + plan.next() + plan.next() 
+ assert(plan.getString(1).contains("InMemoryColumnarTableScan")) + + val rs = statement.executeQuery("SELECT key FROM test_map ORDER BY KEY DESC") + val buf = new collection.mutable.ArrayBuffer[Int]() + while (rs.next()) { + buf += rs.getInt(1) } - rs2.close() + rs.close() + assert(buf === data) + }, - assert(buf1 === buf2) - statement.executeQuery("UNCACHE TABLE test_table") + // switch another database + { statement => + statement.execute("USE db1") - // TODO need to figure out how to determine if the data loaded from cache - val rs3 = statement.executeQuery("SELECT key FROM test_map ORDER BY KEY DESC") - val buf3 = new collection.mutable.ArrayBuffer[Int]() - while (rs3.next()) { - buf3 += rs3.getInt(1) + // there is no test_map table in db1 + intercept[SQLException] { + statement.executeQuery("SELECT key FROM test_map ORDER BY KEY DESC") } - rs3.close() - assert(buf1 === buf3) + statement.execute("CREATE TABLE test_map2(key INT, value STRING)") }, - // accessing the uncached table + // access default database { statement => - // TODO need to figure out how to determine if the data loaded from cache - val rs1 = statement.executeQuery("SELECT key FROM test_table ORDER BY KEY DESC") - val buf1 = new collection.mutable.ArrayBuffer[Int]() - while (rs1.next()) { - buf1 += rs1.getInt(1) - } - rs1.close() - - val rs2 = statement.executeQuery("SELECT key FROM test_map ORDER BY KEY DESC") - val buf2 = new collection.mutable.ArrayBuffer[Int]() - while (rs2.next()) { - buf2 += rs2.getInt(1) + // current database should still be `default` + intercept[SQLException] { + statement.executeQuery("SELECT key FROM test_map2") } - rs2.close() - assert(buf1 === buf2) + statement.execute("USE db1") + // access test_map2 + statement.executeQuery("SELECT key from test_map2") } ) } diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala index 17de8ef56f9a6..dad1e2347c387 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala @@ -25,7 +25,6 @@ import java.util.concurrent.TimeUnit import scala.collection.JavaConverters._ import scala.collection.mutable.HashMap import scala.language.implicitConversions -import scala.concurrent.duration._ import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.hive.common.StatsSetupConst @@ -34,32 +33,49 @@ import org.apache.hadoop.hive.conf.HiveConf import org.apache.hadoop.hive.conf.HiveConf.ConfVars import org.apache.hadoop.hive.ql.metadata.Table import org.apache.hadoop.hive.ql.parse.VariableSubstitution -import org.apache.hadoop.hive.ql.session.SessionState import org.apache.hadoop.hive.serde2.io.{DateWritable, TimestampWritable} -import org.apache.spark.Logging -import org.apache.spark.SparkContext import org.apache.spark.annotation.Experimental -import org.apache.spark.sql._ +import org.apache.spark.api.java.JavaSparkContext import org.apache.spark.sql.SQLConf.SQLConfEntry import org.apache.spark.sql.SQLConf.SQLConfEntry._ -import org.apache.spark.sql.catalyst.{SqlParser, TableIdentifier, ParserDialect} +import org.apache.spark.sql._ +import org.apache.spark.sql.catalyst.expressions.codegen.CodegenFallback +import org.apache.spark.sql.catalyst.expressions.{Expression, LeafExpression} import org.apache.spark.sql.catalyst.analysis._ import org.apache.spark.sql.catalyst.plans.logical._ -import org.apache.spark.sql.execution.{ExecutedCommand, ExtractPythonUDFs, 
SetCommand} -import org.apache.spark.sql.execution.datasources.{PreWriteCheck, PreInsertCastAndRename, DataSourceStrategy} +import org.apache.spark.sql.catalyst.{InternalRow, ParserDialect, SqlParser} +import org.apache.spark.sql.execution.datasources.{DataSourceStrategy, PreInsertCastAndRename, PreWriteCheck} +import org.apache.spark.sql.execution.{CacheManager, ExecutedCommand, ExtractPythonUDFs, SetCommand} import org.apache.spark.sql.hive.client._ import org.apache.spark.sql.hive.execution.{DescribeHiveTableCommand, HiveNativeCommand} import org.apache.spark.sql.types._ +import org.apache.spark.unsafe.types.UTF8String import org.apache.spark.util.Utils +import org.apache.spark.{Logging, SparkContext} /** * This is the HiveQL Dialect, this dialect is strongly bind with HiveContext */ -private[hive] class HiveQLDialect extends ParserDialect { +private[hive] class HiveQLDialect(sqlContext: HiveContext) extends ParserDialect { override def parse(sqlText: String): LogicalPlan = { - HiveQl.parseSql(sqlText) + sqlContext.executionHive.withHiveState { + HiveQl.parseSql(sqlText) + } + } +} + +/** + * Returns the current database of metadataHive. + */ +private[hive] case class CurrentDatabase(ctx: HiveContext) + extends LeafExpression with CodegenFallback { + override def dataType: DataType = StringType + override def foldable: Boolean = true + override def nullable: Boolean = false + override def eval(input: InternalRow): Any = { + UTF8String.fromString(ctx.metadataHive.currentDatabase) } } @@ -69,13 +85,29 @@ private[hive] class HiveQLDialect extends ParserDialect { * * @since 1.0.0 */ -class HiveContext(sc: SparkContext) extends SQLContext(sc) with Logging { +class HiveContext private[hive]( + sc: SparkContext, + cacheManager: CacheManager, + @transient execHive: ClientWrapper, + @transient metaHive: ClientInterface) extends SQLContext(sc, cacheManager) with Logging { self => - import HiveContext._ + def this(sc: SparkContext) = this(sc, new CacheManager, null, null) + def this(sc: JavaSparkContext) = this(sc.sc) + + import org.apache.spark.sql.hive.HiveContext._ logDebug("create HiveContext") + /** + * Returns a new HiveContext as new session, which will have separated SQLConf, UDF/UDAF, + * temporary tables and SessionState, but sharing the same CacheManager, IsolatedClientLoader + * and Hive client (both of execution and metadata) with existing HiveContext. + */ + override def newSession(): HiveContext = { + new HiveContext(sc, cacheManager, executionHive.newSession(), metadataHive.newSession()) + } + /** * When true, enables an experimental feature where metastore tables that use the parquet SerDe * are automatically converted to use the Spark SQL parquet table scan, instead of the Hive @@ -157,14 +189,18 @@ class HiveContext(sc: SparkContext) extends SQLContext(sc) with Logging { * for storing persistent metadata, and only point to a dummy metastore in a temporary directory. 
*/ @transient - protected[hive] lazy val executionHive: ClientWrapper = { + protected[hive] lazy val executionHive: ClientWrapper = if (execHive != null) { + execHive + } else { logInfo(s"Initializing execution hive, version $hiveExecutionVersion") - new ClientWrapper( + val loader = new IsolatedClientLoader( version = IsolatedClientLoader.hiveVersion(hiveExecutionVersion), + execJars = Seq(), config = newTemporaryConfiguration(), - initClassLoader = Utils.getContextOrSparkClassLoader) + isolationOn = false, + baseClassLoader = Utils.getContextOrSparkClassLoader) + loader.createClient().asInstanceOf[ClientWrapper] } - SessionState.setCurrentSessionState(executionHive.state) /** * Overrides default Hive configurations to avoid breaking changes to Spark SQL users. @@ -182,7 +218,9 @@ class HiveContext(sc: SparkContext) extends SQLContext(sc) with Logging { * in the hive-site.xml file. */ @transient - protected[hive] lazy val metadataHive: ClientInterface = { + protected[hive] lazy val metadataHive: ClientInterface = if (metaHive != null) { + metaHive + } else { val metaVersion = IsolatedClientLoader.hiveVersion(hiveMetastoreVersion) // We instantiate a HiveConf here to read in the hive-site.xml file and then pass the options @@ -268,14 +306,10 @@ class HiveContext(sc: SparkContext) extends SQLContext(sc) with Logging { barrierPrefixes = hiveMetastoreBarrierPrefixes, sharedPrefixes = hiveMetastoreSharedPrefixes) } - isolatedLoader.client + isolatedLoader.createClient() } protected[sql] override def parseSql(sql: String): LogicalPlan = { - var state = SessionState.get() - if (state == null) { - SessionState.setCurrentSessionState(tlSession.get().asInstanceOf[SQLSession].sessionState) - } super.parseSql(substitutor.substitute(hiveconf, sql)) } @@ -384,8 +418,6 @@ class HiveContext(sc: SparkContext) extends SQLContext(sc) with Logging { } } - protected[hive] def hiveconf = tlSession.get().asInstanceOf[this.SQLSession].hiveconf - override def setConf(key: String, value: String): Unit = { super.setConf(key, value) executionHive.runSqlHive(s"SET $key=$value") @@ -402,7 +434,7 @@ class HiveContext(sc: SparkContext) extends SQLContext(sc) with Logging { setConf(entry.key, entry.stringConverter(value)) } - /* A catalyst metadata catalog that points to the Hive Metastore. */ + /* A catalyst metadata catalog that points to the Hive Metastore. */ @transient override protected[sql] lazy val catalog = new HiveMetastoreCatalog(metadataHive, this) with OverrideCatalog @@ -410,7 +442,13 @@ class HiveContext(sc: SparkContext) extends SQLContext(sc) with Logging { // Note that HiveUDFs will be overridden by functions registered in this context. @transient override protected[sql] lazy val functionRegistry: FunctionRegistry = - new HiveFunctionRegistry(FunctionRegistry.builtin) + new HiveFunctionRegistry(FunctionRegistry.builtin.copy()) + + // The Hive UDF current_database() is foldable, will be evaluated by optimizer, but the optimizer + // can't access the SessionState of metadataHive. + functionRegistry.registerFunction( + "current_database", + (expressions: Seq[Expression]) => new CurrentDatabase(this)) /* An analyzer that uses the Hive metastore. */ @transient @@ -430,10 +468,6 @@ class HiveContext(sc: SparkContext) extends SQLContext(sc) with Logging { ) } - override protected[sql] def createSession(): SQLSession = { - new this.SQLSession() - } - /** Overridden by child classes that need to set configuration before the client init. 
*/ protected def configure(): Map[String, String] = { // Hive 0.14.0 introduces timeout operations in HiveConf, and changes default values of a bunch @@ -488,41 +522,40 @@ class HiveContext(sc: SparkContext) extends SQLContext(sc) with Logging { }.toMap } - protected[hive] class SQLSession extends super.SQLSession { - protected[sql] override lazy val conf: SQLConf = new SQLConf { - override def dialect: String = getConf(SQLConf.DIALECT, "hiveql") - override def caseSensitiveAnalysis: Boolean = getConf(SQLConf.CASE_SENSITIVE, false) - } - - /** - * SQLConf and HiveConf contracts: - * - * 1. reuse existing started SessionState if any - * 2. when the Hive session is first initialized, params in HiveConf will get picked up by the - * SQLConf. Additionally, any properties set by set() or a SET command inside sql() will be - * set in the SQLConf *as well as* in the HiveConf. - */ - protected[hive] lazy val sessionState: SessionState = { - var state = SessionState.get() - if (state == null) { - state = new SessionState(new HiveConf(classOf[SessionState])) - SessionState.start(state) - } - state - } + /** + * SQLConf and HiveConf contracts: + * + * 1. create a new SessionState for each HiveContext + * 2. when the Hive session is first initialized, params in HiveConf will get picked up by the + * SQLConf. Additionally, any properties set by set() or a SET command inside sql() will be + * set in the SQLConf *as well as* in the HiveConf. + */ + @transient + protected[hive] lazy val hiveconf: HiveConf = { + val c = executionHive.conf + setConf(c.getAllProperties) + c + } - protected[hive] lazy val hiveconf: HiveConf = { - setConf(sessionState.getConf.getAllProperties) - sessionState.getConf - } + protected[sql] override lazy val conf: SQLConf = new SQLConf { + override def dialect: String = getConf(SQLConf.DIALECT, "hiveql") + override def caseSensitiveAnalysis: Boolean = getConf(SQLConf.CASE_SENSITIVE, false) } - override protected[sql] def dialectClassName = if (conf.dialect == "hiveql") { + protected[sql] override def dialectClassName = if (conf.dialect == "hiveql") { classOf[HiveQLDialect].getCanonicalName } else { super.dialectClassName } + protected[sql] override def getSQLDialect(): ParserDialect = { + if (conf.dialect == "hiveql") { + new HiveQLDialect(this) + } else { + super.getSQLDialect() + } + } + @transient private val hivePlanner = new SparkPlanner with HiveStrategies { val hiveContext = self @@ -598,6 +631,14 @@ class HiveContext(sc: SparkContext) extends SQLContext(sc) with Logging { case _ => super.simpleString } } + + protected[sql] override def addJar(path: String): Unit = { + // Add jar to Hive and classloader + executionHive.addJar(path) + metadataHive.addJar(path) + Thread.currentThread().setContextClassLoader(executionHive.clientLoader.classLoader) + super.addJar(path) + } } diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala index 2bf22f5449641..250c232856885 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala @@ -25,29 +25,27 @@ import scala.collection.mutable.ArrayBuffer import org.apache.hadoop.hive.conf.HiveConf import org.apache.hadoop.hive.conf.HiveConf.ConfVars -import org.apache.hadoop.hive.serde.serdeConstants -import org.apache.hadoop.hive.ql.{ErrorMsg, Context} -import org.apache.hadoop.hive.ql.exec.{FunctionRegistry, FunctionInfo} +import org.apache.hadoop.hive.ql.exec.{FunctionInfo, 
FunctionRegistry} import org.apache.hadoop.hive.ql.lib.Node import org.apache.hadoop.hive.ql.parse._ import org.apache.hadoop.hive.ql.plan.PlanUtils import org.apache.hadoop.hive.ql.session.SessionState +import org.apache.hadoop.hive.ql.{Context, ErrorMsg} +import org.apache.hadoop.hive.serde.serdeConstants import org.apache.hadoop.hive.serde2.`lazy`.LazySimpleSerDe import org.apache.spark.Logging -import org.apache.spark.sql.AnalysisException -import org.apache.spark.sql.catalyst +import org.apache.spark.sql.{AnalysisException, catalyst} import org.apache.spark.sql.catalyst.analysis._ import org.apache.spark.sql.catalyst.expressions._ -import org.apache.spark.sql.catalyst.plans._ -import org.apache.spark.sql.catalyst.plans.logical +import org.apache.spark.sql.catalyst.plans.{logical, _} import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.catalyst.trees.CurrentOrigin import org.apache.spark.sql.execution.ExplainCommand import org.apache.spark.sql.execution.datasources.DescribeCommand import org.apache.spark.sql.hive.HiveShim._ import org.apache.spark.sql.hive.client._ -import org.apache.spark.sql.hive.execution.{HiveNativeCommand, DropTable, AnalyzeTable, HiveScriptIOSchema} +import org.apache.spark.sql.hive.execution.{AnalyzeTable, DropTable, HiveNativeCommand, HiveScriptIOSchema} import org.apache.spark.sql.types._ import org.apache.spark.unsafe.types.CalendarInterval import org.apache.spark.util.random.RandomSampler @@ -268,7 +266,7 @@ private[hive] object HiveQl extends Logging { node } - private def createContext(): Context = new Context(SessionState.get().getConf()) + private def createContext(): Context = new Context(hiveConf) private def getAst(sql: String, context: Context) = ParseUtils.findRootNonNullToken((new ParseDriver).parse(sql, context)) @@ -277,12 +275,16 @@ private[hive] object HiveQl extends Logging { * Returns the HiveConf */ private[this] def hiveConf: HiveConf = { - val ss = SessionState.get() // SessionState is lazy initialization, it can be null here + var ss = SessionState.get() + // SessionState is lazy initialization, it can be null here if (ss == null) { - new HiveConf() - } else { - ss.getConf + val original = Thread.currentThread().getContextClassLoader + val conf = new HiveConf(classOf[SessionState]) + conf.setClassLoader(original) + ss = new SessionState(conf) + SessionState.start(ss) } + ss.getConf } /** Returns a LogicalPlan for a given HiveQL string. */ diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/ClientInterface.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/ClientInterface.scala index 915eae9d21e23..9d9a55edd7314 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/ClientInterface.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/ClientInterface.scala @@ -178,6 +178,15 @@ private[hive] trait ClientInterface { holdDDLTime: Boolean, listBucketingEnabled: Boolean): Unit + /** Add a jar into class loader */ + def addJar(path: String): Unit + + /** Return a ClientInterface as new session, that will share the class loader and Hive client */ + def newSession(): ClientInterface + + /** Run a function within Hive state (SessionState, HiveConf, Hive client and class loader) */ + def withHiveState[A](f: => A): A + /** Used for testing only. Removes all metadata from this instance of Hive. 
*/ def reset(): Unit } diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/ClientWrapper.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/ClientWrapper.scala index 8f6d448b2aef4..3dce86c480747 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/ClientWrapper.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/ClientWrapper.scala @@ -60,7 +60,8 @@ import org.apache.spark.util.{CircularBuffer, Utils} private[hive] class ClientWrapper( override val version: HiveVersion, config: Map[String, String], - initClassLoader: ClassLoader) + initClassLoader: ClassLoader, + val clientLoader: IsolatedClientLoader) extends ClientInterface with Logging { @@ -150,31 +151,29 @@ private[hive] class ClientWrapper( // Switch to the initClassLoader. Thread.currentThread().setContextClassLoader(initClassLoader) val ret = try { - val oldState = SessionState.get() - if (oldState == null) { - val initialConf = new HiveConf(classOf[SessionState]) - // HiveConf is a Hadoop Configuration, which has a field of classLoader and - // the initial value will be the current thread's context class loader - // (i.e. initClassLoader at here). - // We call initialConf.setClassLoader(initClassLoader) at here to make - // this action explicit. - initialConf.setClassLoader(initClassLoader) - config.foreach { case (k, v) => - if (k.toLowerCase.contains("password")) { - logDebug(s"Hive Config: $k=xxx") - } else { - logDebug(s"Hive Config: $k=$v") - } - initialConf.set(k, v) + val initialConf = new HiveConf(classOf[SessionState]) + // HiveConf is a Hadoop Configuration, which has a field of classLoader and + // the initial value will be the current thread's context class loader + // (i.e. initClassLoader at here). + // We call initialConf.setClassLoader(initClassLoader) at here to make + // this action explicit. + initialConf.setClassLoader(initClassLoader) + config.foreach { case (k, v) => + if (k.toLowerCase.contains("password")) { + logDebug(s"Hive Config: $k=xxx") + } else { + logDebug(s"Hive Config: $k=$v") } - val newState = new SessionState(initialConf) - SessionState.start(newState) - newState.out = new PrintStream(outputBuffer, true, "UTF-8") - newState.err = new PrintStream(outputBuffer, true, "UTF-8") - newState - } else { - oldState + initialConf.set(k, v) + } + val state = new SessionState(initialConf) + if (clientLoader.cachedHive != null) { + Hive.set(clientLoader.cachedHive.asInstanceOf[Hive]) } + SessionState.start(state) + state.out = new PrintStream(outputBuffer, true, "UTF-8") + state.err = new PrintStream(outputBuffer, true, "UTF-8") + state } finally { Thread.currentThread().setContextClassLoader(original) } @@ -188,11 +187,6 @@ private[hive] class ClientWrapper( conf.get(key, defaultValue) } - // TODO: should be a def?s - // When we create this val client, the HiveConf of it (conf) is the one associated with state. - @GuardedBy("this") - private var client = Hive.get(conf) - // We use hive's conf for compatibility. private val retryLimit = conf.getIntVar(HiveConf.ConfVars.METASTORETHRIFTFAILURERETRIES) private val retryDelayMillis = shim.getMetastoreClientConnectRetryDelayMillis(conf) @@ -200,7 +194,7 @@ private[hive] class ClientWrapper( /** * Runs `f` with multiple retries in case the hive metastore is temporarily unreachable. */ - private def retryLocked[A](f: => A): A = synchronized { + private def retryLocked[A](f: => A): A = clientLoader.synchronized { // Hive sometimes retries internally, so set a deadline to avoid compounding delays. 
val deadline = System.nanoTime + (retryLimit * retryDelayMillis * 1e6).toLong var numTries = 0 @@ -215,13 +209,8 @@ private[hive] class ClientWrapper( logWarning( "HiveClientWrapper got thrift exception, destroying client and retrying " + s"(${retryLimit - numTries} tries remaining)", e) + clientLoader.cachedHive = null Thread.sleep(retryDelayMillis) - try { - client = Hive.get(state.getConf, true) - } catch { - case e: Exception if causedByThrift(e) => - logWarning("Failed to refresh hive client, will retry.", e) - } } } while (numTries <= retryLimit && System.nanoTime < deadline) if (System.nanoTime > deadline) { @@ -242,13 +231,26 @@ private[hive] class ClientWrapper( false } + def client: Hive = { + if (clientLoader.cachedHive != null) { + clientLoader.cachedHive.asInstanceOf[Hive] + } else { + val c = Hive.get(conf) + clientLoader.cachedHive = c + c + } + } + /** * Runs `f` with ThreadLocal session state and classloaders configured for this version of hive. */ - private def withHiveState[A](f: => A): A = retryLocked { + def withHiveState[A](f: => A): A = retryLocked { val original = Thread.currentThread().getContextClassLoader // Set the thread local metastore client to the client associated with this ClientWrapper. Hive.set(client) + // The classloader in clientLoader could be changed after addJar, always use the latest + // classloader + state.getConf.setClassLoader(clientLoader.classLoader) // setCurrentSessionState will use the classLoader associated // with the HiveConf in `state` to override the context class loader of the current // thread. @@ -545,6 +547,15 @@ private[hive] class ClientWrapper( listBucketingEnabled) } + def addJar(path: String): Unit = { + clientLoader.addJar(path) + runSqlHive(s"ADD JAR $path") + } + + def newSession(): ClientWrapper = { + clientLoader.createClient().asInstanceOf[ClientWrapper] + } + def reset(): Unit = withHiveState { client.getAllTables("default").asScala.foreach { t => logDebug(s"Deleting table $t") diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/IsolatedClientLoader.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/IsolatedClientLoader.scala index 1fe4cba9571f3..567e4d7b411ec 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/IsolatedClientLoader.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/IsolatedClientLoader.scala @@ -22,6 +22,7 @@ import java.lang.reflect.InvocationTargetException import java.net.{URL, URLClassLoader} import java.util +import scala.collection.mutable import scala.language.reflectiveCalls import scala.util.Try @@ -148,53 +149,75 @@ private[hive] class IsolatedClientLoader( name.replaceAll("\\.", "/") + ".class" /** The classloader that is used to load an isolated version of Hive. */ - protected val classLoader: ClassLoader = new URLClassLoader(allJars, rootClassLoader) { - override def loadClass(name: String, resolve: Boolean): Class[_] = { - val loaded = findLoadedClass(name) - if (loaded == null) doLoadClass(name, resolve) else loaded - } - - def doLoadClass(name: String, resolve: Boolean): Class[_] = { - val classFileName = name.replaceAll("\\.", "/") + ".class" - if (isBarrierClass(name) && isolationOn) { - // For barrier classes, we construct a new copy of the class. 
- val bytes = IOUtils.toByteArray(baseClassLoader.getResourceAsStream(classFileName)) - logDebug(s"custom defining: $name - ${util.Arrays.hashCode(bytes)}") - defineClass(name, bytes, 0, bytes.length) - } else if (!isSharedClass(name)) { - logDebug(s"hive class: $name - ${getResource(classToPath(name))}") - super.loadClass(name, resolve) - } else { - // For shared classes, we delegate to baseClassLoader. - logDebug(s"shared class: $name") - baseClassLoader.loadClass(name) + private[hive] var classLoader: ClassLoader = if (isolationOn) { + new URLClassLoader(allJars, rootClassLoader) { + override def loadClass(name: String, resolve: Boolean): Class[_] = { + val loaded = findLoadedClass(name) + if (loaded == null) doLoadClass(name, resolve) else loaded + } + def doLoadClass(name: String, resolve: Boolean): Class[_] = { + val classFileName = name.replaceAll("\\.", "/") + ".class" + if (isBarrierClass(name)) { + // For barrier classes, we construct a new copy of the class. + val bytes = IOUtils.toByteArray(baseClassLoader.getResourceAsStream(classFileName)) + logDebug(s"custom defining: $name - ${util.Arrays.hashCode(bytes)}") + defineClass(name, bytes, 0, bytes.length) + } else if (!isSharedClass(name)) { + logDebug(s"hive class: $name - ${getResource(classToPath(name))}") + super.loadClass(name, resolve) + } else { + // For shared classes, we delegate to baseClassLoader. + logDebug(s"shared class: $name") + baseClassLoader.loadClass(name) + } } } + } else { + baseClassLoader } - // Pre-reflective instantiation setup. - logDebug("Initializing the logger to avoid disaster...") - Thread.currentThread.setContextClassLoader(classLoader) + private[hive] def addJar(path: String): Unit = synchronized { + val jarURL = new java.io.File(path).toURI.toURL + // TODO: we should avoid of stacking classloaders (use a single URLClassLoader and add jars + // to that) + classLoader = new java.net.URLClassLoader(Array(jarURL), classLoader) + } /** The isolated client interface to Hive. */ - val client: ClientInterface = try { - classLoader - .loadClass(classOf[ClientWrapper].getName) - .getConstructors.head - .newInstance(version, config, classLoader) - .asInstanceOf[ClientInterface] - } catch { - case e: InvocationTargetException => - if (e.getCause().isInstanceOf[NoClassDefFoundError]) { - val cnf = e.getCause().asInstanceOf[NoClassDefFoundError] - throw new ClassNotFoundException( - s"$cnf when creating Hive client using classpath: ${execJars.mkString(", ")}\n" + - "Please make sure that jars for your version of hive and hadoop are included in the " + - s"paths passed to ${HiveContext.HIVE_METASTORE_JARS}.") - } else { - throw e - } - } finally { - Thread.currentThread.setContextClassLoader(baseClassLoader) + private[hive] def createClient(): ClientInterface = { + if (!isolationOn) { + return new ClientWrapper(version, config, baseClassLoader, this) + } + // Pre-reflective instantiation setup. 
+ logDebug("Initializing the logger to avoid disaster...") + val origLoader = Thread.currentThread().getContextClassLoader + Thread.currentThread.setContextClassLoader(classLoader) + + try { + classLoader + .loadClass(classOf[ClientWrapper].getName) + .getConstructors.head + .newInstance(version, config, classLoader, this) + .asInstanceOf[ClientInterface] + } catch { + case e: InvocationTargetException => + if (e.getCause().isInstanceOf[NoClassDefFoundError]) { + val cnf = e.getCause().asInstanceOf[NoClassDefFoundError] + throw new ClassNotFoundException( + s"$cnf when creating Hive client using classpath: ${execJars.mkString(", ")}\n" + + "Please make sure that jars for your version of hive and hadoop are included in the " + + s"paths passed to ${HiveContext.HIVE_METASTORE_JARS}.") + } else { + throw e + } + } finally { + Thread.currentThread.setContextClassLoader(origLoader) + } } + + /** + * The place holder for shared Hive client for all the HiveContext sessions (they share an + * IsolatedClientLoader). + */ + private[hive] var cachedHive: Any = null } diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/commands.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/commands.scala index 9f654eed5761c..51ec92afd06ed 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/commands.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/commands.scala @@ -18,18 +18,18 @@ package org.apache.spark.sql.hive.execution import org.apache.hadoop.hive.metastore.MetaStoreUtils + import org.apache.spark.sql._ -import org.apache.spark.sql.catalyst.{TableIdentifier, SqlParser} +import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.catalyst.analysis.EliminateSubQueries import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.catalyst.util._ import org.apache.spark.sql.execution.RunnableCommand -import org.apache.spark.sql.execution.datasources.{ResolvedDataSource, LogicalRelation} +import org.apache.spark.sql.execution.datasources.{LogicalRelation, ResolvedDataSource} import org.apache.spark.sql.hive.HiveContext import org.apache.spark.sql.sources._ import org.apache.spark.sql.types._ -import org.apache.spark.util.Utils /** * Analyzes the given table in the current database to generate statistics, which will be @@ -86,26 +86,7 @@ case class AddJar(path: String) extends RunnableCommand { } override def run(sqlContext: SQLContext): Seq[Row] = { - val hiveContext = sqlContext.asInstanceOf[HiveContext] - val currentClassLoader = Utils.getContextOrSparkClassLoader - - // Add jar to current context - val jarURL = new java.io.File(path).toURI.toURL - val newClassLoader = new java.net.URLClassLoader(Array(jarURL), currentClassLoader) - Thread.currentThread.setContextClassLoader(newClassLoader) - // We need to explicitly set the class loader associated with the conf in executionHive's - // state because this class loader will be used as the context class loader of the current - // thread to execute any Hive command. - // We cannot use `org.apache.hadoop.hive.ql.metadata.Hive.get().getConf()` because Hive.get() - // returns the value of a thread local variable and its HiveConf may not be the HiveConf - // associated with `executionHive.state` (for example, HiveContext is created in one thread - // and then add jar is called from another thread). 
- hiveContext.executionHive.state.getConf.setClassLoader(newClassLoader) - // Add jar to isolated hive (metadataHive) class loader. - hiveContext.runSqlHive(s"ADD JAR $path") - - // Add jar to executors - hiveContext.sparkContext.addJar(path) + sqlContext.addJar(path) Seq(Row(0)) } diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/test/TestHive.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/test/TestHive.scala index be335a47dcabd..ff39ccb7c1ea5 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/test/TestHive.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/test/TestHive.scala @@ -116,27 +116,18 @@ class TestHiveContext(sc: SparkContext) extends HiveContext(sc) { override def executePlan(plan: LogicalPlan): this.QueryExecution = new this.QueryExecution(plan) - // Make sure we set those test specific confs correctly when we create - // the SQLConf as well as when we call clear. - override protected[sql] def createSession(): SQLSession = { - new this.SQLSession() - } - - protected[hive] class SQLSession extends super.SQLSession { - protected[sql] override lazy val conf: SQLConf = new SQLConf { - // TODO as in unit test, conf.clear() probably be called, all of the value will be cleared. - // The super.getConf(SQLConf.DIALECT) is "sql" by default, we need to set it as "hiveql" - override def dialect: String = super.getConf(SQLConf.DIALECT, "hiveql") - override def caseSensitiveAnalysis: Boolean = getConf(SQLConf.CASE_SENSITIVE, false) + protected[sql] override lazy val conf: SQLConf = new SQLConf { + // The super.getConf(SQLConf.DIALECT) is "sql" by default, we need to set it as "hiveql" + override def dialect: String = super.getConf(SQLConf.DIALECT, "hiveql") + override def caseSensitiveAnalysis: Boolean = getConf(SQLConf.CASE_SENSITIVE, false) - clear() + clear() - override def clear(): Unit = { - super.clear() + override def clear(): Unit = { + super.clear() - TestHiveContext.overrideConfs.map { - case (key, value) => setConfString(key, value) - } + TestHiveContext.overrideConfs.map { + case (key, value) => setConfString(key, value) } } } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveQlSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveQlSuite.scala index 79cf40aba4bf2..528a7398b10df 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveQlSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveQlSuite.scala @@ -17,22 +17,15 @@ package org.apache.spark.sql.hive -import org.apache.hadoop.hive.conf.HiveConf -import org.apache.hadoop.hive.ql.session.SessionState import org.apache.hadoop.hive.serde.serdeConstants +import org.scalatest.BeforeAndAfterAll + import org.apache.spark.SparkFunSuite import org.apache.spark.sql.AnalysisException -import org.apache.spark.sql.hive.client.{ManagedTable, HiveColumn, ExternalTable, HiveTable} -import org.scalatest.BeforeAndAfterAll +import org.apache.spark.sql.hive.client.{ExternalTable, HiveColumn, HiveTable, ManagedTable} class HiveQlSuite extends SparkFunSuite with BeforeAndAfterAll { - override def beforeAll() { - if (SessionState.get() == null) { - SessionState.start(new HiveConf()) - } - } - private def extractTableDesc(sql: String): (HiveTable, Boolean) = { HiveQl.createPlan(sql).collect { case CreateTableAsSelect(desc, child, allowExisting) => (desc, allowExisting) diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/VersionsSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/VersionsSuite.scala index 
2da22ec2379f3..c6d034a23a1c6 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/VersionsSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/VersionsSuite.scala @@ -53,7 +53,7 @@ class VersionsSuite extends SparkFunSuite with Logging { test("success sanity check") { val badClient = IsolatedClientLoader.forVersion(HiveContext.hiveExecutionVersion, buildConf(), - ivyPath).client + ivyPath).createClient() val db = new HiveDatabase("default", "") badClient.createDatabase(db) } @@ -83,7 +83,7 @@ class VersionsSuite extends SparkFunSuite with Logging { ignore("failure sanity check") { val e = intercept[Throwable] { val badClient = quietly { - IsolatedClientLoader.forVersion("13", buildConf(), ivyPath).client + IsolatedClientLoader.forVersion("13", buildConf(), ivyPath).createClient() } } assert(getNestedMessages(e) contains "Unknown column 'A0.OWNER_NAME' in 'field list'") @@ -97,7 +97,7 @@ class VersionsSuite extends SparkFunSuite with Logging { test(s"$version: create client") { client = null System.gc() // Hack to avoid SEGV on some JVM versions. - client = IsolatedClientLoader.forVersion(version, buildConf(), ivyPath).client + client = IsolatedClientLoader.forVersion(version, buildConf(), ivyPath).createClient() } test(s"$version: createDatabase") { diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala index fe63ad5683195..2878500453141 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala @@ -1133,6 +1133,38 @@ class HiveQuerySuite extends HiveComparisonTest with BeforeAndAfter { conf.clear() } + test("current_database with multiple sessions") { + sql("create database a") + sql("use a") + val s2 = newSession() + s2.sql("create database b") + s2.sql("use b") + + assert(sql("select current_database()").first() === Row("a")) + assert(s2.sql("select current_database()").first() === Row("b")) + + try { + sql("create table test_a(key INT, value STRING)") + s2.sql("create table test_b(key INT, value STRING)") + + sql("select * from test_a") + intercept[AnalysisException] { + sql("select * from test_b") + } + sql("select * from b.test_b") + + s2.sql("select * from test_b") + intercept[AnalysisException] { + s2.sql("select * from test_a") + } + s2.sql("select * from a.test_a") + } finally { + sql("DROP TABLE IF EXISTS test_a") + s2.sql("DROP TABLE IF EXISTS test_b") + } + + } + createQueryTest("select from thrift based table", "SELECT * from src_thrift") diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala index ec5b83b98e401..ccc15eaa63f42 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala @@ -160,10 +160,15 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton { } test("show functions") { - val allFunctions = + val allBuiltinFunctions = (FunctionRegistry.builtin.listFunction().toSet[String] ++ org.apache.hadoop.hive.ql.exec.FunctionRegistry.getFunctionNames.asScala).toList.sorted - checkAnswer(sql("SHOW functions"), allFunctions.map(Row(_))) + // The TestContext is shared by all the test cases, some functions may be registered before + // 
this, so we check that all the builtin functions are returned. + val allFunctions = sql("SHOW functions").collect().map(r => r(0)) + allBuiltinFunctions.foreach { f => + assert(allFunctions.contains(f)) + } checkAnswer(sql("SHOW functions abs"), Row("abs")) checkAnswer(sql("SHOW functions 'abs'"), Row("abs")) checkAnswer(sql("SHOW functions abc.abs"), Row("abs")) From 8e67882b905683a1f151679214ef0b575e77c7e1 Mon Sep 17 00:00:00 2001 From: zero323 Date: Thu, 8 Oct 2015 18:34:15 -0700 Subject: [PATCH 008/139] =?UTF-8?q?[SPARK-10973]=20[ML]=20[PYTHON]=20=5F?= =?UTF-8?q?=5Fgettitem=5F=5F=20method=20throws=20IndexError=20exception=20?= =?UTF-8?q?when=20we=E2=80=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit __gettitem__ method throws IndexError exception when we try to access index after the last non-zero entry from pyspark.mllib.linalg import Vectors sv = Vectors.sparse(5, {1: 3}) sv[0] ## 0.0 sv[1] ## 3.0 sv[2] ## Traceback (most recent call last): ## File "", line 1, in ## File "/python/pyspark/mllib/linalg/__init__.py", line 734, in __getitem__ ## row_ind = inds[insert_index] ## IndexError: index out of bounds Author: zero323 Closes #9009 from zero323/sparse_vector_index_error. --- python/pyspark/mllib/linalg/__init__.py | 3 +++ python/pyspark/mllib/tests.py | 12 +++++++----- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/python/pyspark/mllib/linalg/__init__.py b/python/pyspark/mllib/linalg/__init__.py index ea42127f1651f..d903b9030d8ce 100644 --- a/python/pyspark/mllib/linalg/__init__.py +++ b/python/pyspark/mllib/linalg/__init__.py @@ -770,6 +770,9 @@ def __getitem__(self, index): raise ValueError("Index %d out of bounds." % index) insert_index = np.searchsorted(inds, index) + if insert_index >= inds.size: + return 0. + row_ind = inds[insert_index] if row_ind == index: return vals[insert_index] diff --git a/python/pyspark/mllib/tests.py b/python/pyspark/mllib/tests.py index 96cf13495aa95..2a6a5cd3fe40e 100644 --- a/python/pyspark/mllib/tests.py +++ b/python/pyspark/mllib/tests.py @@ -237,15 +237,17 @@ def test_conversion(self): self.assertTrue(dv.array.dtype == 'float64') def test_sparse_vector_indexing(self): - sv = SparseVector(4, {1: 1, 3: 2}) + sv = SparseVector(5, {1: 1, 3: 2}) self.assertEqual(sv[0], 0.) self.assertEqual(sv[3], 2.) self.assertEqual(sv[1], 1.) self.assertEqual(sv[2], 0.) - self.assertEqual(sv[-1], 2) - self.assertEqual(sv[-2], 0) - self.assertEqual(sv[-4], 0) - for ind in [4, -5]: + self.assertEqual(sv[4], 0.) + self.assertEqual(sv[-1], 0.) + self.assertEqual(sv[-2], 2.) + self.assertEqual(sv[-3], 0.) + self.assertEqual(sv[-5], 0.) + for ind in [5, -6]: self.assertRaises(ValueError, sv.__getitem__, ind) for ind in [7.8, '1']: self.assertRaises(TypeError, sv.__getitem__, ind) From fa3e4d8f52995bf632e7eda60dbb776c9f637546 Mon Sep 17 00:00:00 2001 From: Hari Shreedharan Date: Thu, 8 Oct 2015 18:50:27 -0700 Subject: [PATCH 009/139] =?UTF-8?q?[SPARK-11019]=20[STREAMING]=20[FLUME]?= =?UTF-8?q?=20Gracefully=20shutdown=20Flume=20receiver=20th=E2=80=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit …reads. Wait for a minute for the receiver threads to shutdown before interrupting them. Author: Hari Shreedharan Closes #9041 from harishreedharan/flume-graceful-shutdown. 
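The diff below applies the standard two-phase ExecutorService shutdown idiom: stop accepting
work, wait up to a grace period for in-flight tasks to drain, and only then interrupt the
remaining threads. A minimal, self-contained sketch of that idiom follows; the names
pollingPool and pollOnce are hypothetical placeholders for illustration only, not Spark or
Flume APIs.

    import java.util.concurrent.{Executors, TimeUnit}

    object GracefulShutdownSketch {
      def main(args: Array[String]): Unit = {
        val pollingPool = Executors.newFixedThreadPool(2)

        // Hypothetical long-running polling tasks, standing in for the receiver threads.
        (1 to 2).foreach { i =>
          pollingPool.submit(new Runnable {
            override def run(): Unit = pollOnce(i)
          })
        }

        // Phase 1: stop accepting new tasks, but let in-flight work finish.
        pollingPool.shutdown()
        // Phase 2: only interrupt threads that are still alive after the grace period.
        if (!pollingPool.awaitTermination(60, TimeUnit.SECONDS)) {
          pollingPool.shutdownNow()
        }
      }

      // A single unit of polling work; sleeps briefly so the sketch terminates quickly.
      private def pollOnce(id: Int): Unit = {
        Thread.sleep(1000)
        println(s"worker " + id + " finished a poll")
      }
    }

The key property is that shutdown() lets queued work drain normally, while shutdownNow() is
reserved for the case where the threads have not exited within the timeout.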
--- .../spark/streaming/flume/FlumePollingInputDStream.scala | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/external/flume/src/main/scala/org/apache/spark/streaming/flume/FlumePollingInputDStream.scala b/external/flume/src/main/scala/org/apache/spark/streaming/flume/FlumePollingInputDStream.scala index 3b936d88abd3e..6737750c3d63e 100644 --- a/external/flume/src/main/scala/org/apache/spark/streaming/flume/FlumePollingInputDStream.scala +++ b/external/flume/src/main/scala/org/apache/spark/streaming/flume/FlumePollingInputDStream.scala @@ -18,7 +18,7 @@ package org.apache.spark.streaming.flume import java.net.InetSocketAddress -import java.util.concurrent.{LinkedBlockingQueue, Executors} +import java.util.concurrent.{Executors, LinkedBlockingQueue, TimeUnit} import scala.collection.JavaConverters._ import scala.reflect.ClassTag @@ -93,7 +93,11 @@ private[streaming] class FlumePollingReceiver( override def onStop(): Unit = { logInfo("Shutting down Flume Polling Receiver") - receiverExecutor.shutdownNow() + receiverExecutor.shutdown() + // Wait upto a minute for the threads to die + if (!receiverExecutor.awaitTermination(60, TimeUnit.SECONDS)) { + receiverExecutor.shutdownNow() + } connections.asScala.foreach(_.transceiver.close()) channelFactory.releaseExternalResources() } From 09841290055770a619a2e72fbaef1a5e694916ae Mon Sep 17 00:00:00 2001 From: Hari Shreedharan Date: Thu, 8 Oct 2015 18:53:38 -0700 Subject: [PATCH 010/139] [SPARK-10955] [STREAMING] Add a warning if dynamic allocation for Streaming applications Dynamic allocation can be painful for streaming apps and can lose data. Log a warning for streaming applications if dynamic allocation is enabled. Author: Hari Shreedharan Closes #8998 from harishreedharan/ss-log-error and squashes the following commits: 462b264 [Hari Shreedharan] Improve log message. 2733d94 [Hari Shreedharan] Minor change to warning message. eaa48cc [Hari Shreedharan] Log a warning instead of failing the application if dynamic allocation is enabled. 725f090 [Hari Shreedharan] Add config parameter to allow dynamic allocation if the user explicitly sets it. b3f9a95 [Hari Shreedharan] Disable dynamic allocation and kill app if it is enabled. a4a5212 [Hari Shreedharan] [streaming] SPARK-10955. Disable dynamic allocation for Streaming applications. --- .../org/apache/spark/streaming/StreamingContext.scala | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/streaming/src/main/scala/org/apache/spark/streaming/StreamingContext.scala b/streaming/src/main/scala/org/apache/spark/streaming/StreamingContext.scala index 94fea63f55b25..9b2632c229548 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/StreamingContext.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/StreamingContext.scala @@ -44,7 +44,7 @@ import org.apache.spark.streaming.dstream._ import org.apache.spark.streaming.receiver.{ActorReceiver, ActorSupervisorStrategy, Receiver} import org.apache.spark.streaming.scheduler.{JobScheduler, StreamingListener} import org.apache.spark.streaming.ui.{StreamingJobProgressListener, StreamingTab} -import org.apache.spark.util.{CallSite, ShutdownHookManager, ThreadUtils} +import org.apache.spark.util.{CallSite, ShutdownHookManager, ThreadUtils, Utils} /** * Main entry point for Spark Streaming functionality. 
It provides methods used to create @@ -564,6 +564,13 @@ class StreamingContext private[streaming] ( ) } } + + if (Utils.isDynamicAllocationEnabled(sc.conf)) { + logWarning("Dynamic Allocation is enabled for this application. " + + "Enabling Dynamic allocation for Spark Streaming applications can cause data loss if " + + "Write Ahead Log is not enabled for non-replayable sources like Flume. " + + "See the programming guide for details on how to enable the Write Ahead Log") + } } /** From 67fbecbf32fced87d3accd2618fef2af9f44fae2 Mon Sep 17 00:00:00 2001 From: Andrew Or Date: Thu, 8 Oct 2015 21:44:59 -0700 Subject: [PATCH 011/139] [SPARK-10956] Common MemoryManager interface for storage and execution This patch introduces a `MemoryManager` that is the central arbiter of how much memory to grant to storage and execution. This patch is primarily concerned only with refactoring while preserving the existing behavior as much as possible. This is the first step away from the existing rigid separation of storage and execution memory, which has several major drawbacks discussed on the [issue](https://issues.apache.org/jira/browse/SPARK-10956). It is the precursor of a series of patches that will attempt to address those drawbacks. Author: Andrew Or Author: Josh Rosen Author: andrewor14 Closes #9000 from andrewor14/memory-manager. --- .../scala/org/apache/spark/SparkEnv.scala | 11 +- .../apache/spark/memory/MemoryManager.scala | 117 ++++++++ .../spark/memory/StaticMemoryManager.scala | 202 +++++++++++++ .../spark/shuffle/ShuffleMemoryManager.scala | 69 +++-- .../apache/spark/storage/BlockManager.scala | 33 +-- .../apache/spark/storage/MemoryStore.scala | 272 +++++++++--------- .../memory/StaticMemoryManagerSuite.scala | 172 +++++++++++ .../BlockManagerReplicationSuite.scala | 29 +- .../spark/storage/BlockManagerSuite.scala | 34 ++- .../execution/TestShuffleMemoryManager.scala | 28 +- .../streaming/ReceivedBlockHandlerSuite.scala | 13 +- 11 files changed, 752 insertions(+), 228 deletions(-) create mode 100644 core/src/main/scala/org/apache/spark/memory/MemoryManager.scala create mode 100644 core/src/main/scala/org/apache/spark/memory/StaticMemoryManager.scala create mode 100644 core/src/test/scala/org/apache/spark/memory/StaticMemoryManagerSuite.scala diff --git a/core/src/main/scala/org/apache/spark/SparkEnv.scala b/core/src/main/scala/org/apache/spark/SparkEnv.scala index cfde27fb2e7d3..df3d84a1f08e9 100644 --- a/core/src/main/scala/org/apache/spark/SparkEnv.scala +++ b/core/src/main/scala/org/apache/spark/SparkEnv.scala @@ -30,6 +30,7 @@ import org.apache.spark.annotation.DeveloperApi import org.apache.spark.api.python.PythonWorkerFactory import org.apache.spark.broadcast.BroadcastManager import org.apache.spark.metrics.MetricsSystem +import org.apache.spark.memory.{MemoryManager, StaticMemoryManager} import org.apache.spark.network.BlockTransferService import org.apache.spark.network.netty.NettyBlockTransferService import org.apache.spark.rpc.{RpcEndpointRef, RpcEndpoint, RpcEnv} @@ -69,6 +70,8 @@ class SparkEnv ( val httpFileServer: HttpFileServer, val sparkFilesDir: String, val metricsSystem: MetricsSystem, + // TODO: unify these *MemoryManager classes (SPARK-10984) + val memoryManager: MemoryManager, val shuffleMemoryManager: ShuffleMemoryManager, val executorMemoryManager: ExecutorMemoryManager, val outputCommitCoordinator: OutputCommitCoordinator, @@ -332,7 +335,8 @@ object SparkEnv extends Logging { val shuffleMgrClass = shortShuffleMgrNames.getOrElse(shuffleMgrName.toLowerCase, 
shuffleMgrName) val shuffleManager = instantiateClass[ShuffleManager](shuffleMgrClass) - val shuffleMemoryManager = ShuffleMemoryManager.create(conf, numUsableCores) + val memoryManager = new StaticMemoryManager(conf) + val shuffleMemoryManager = ShuffleMemoryManager.create(conf, memoryManager, numUsableCores) val blockTransferService = new NettyBlockTransferService(conf, securityManager, numUsableCores) @@ -343,8 +347,8 @@ object SparkEnv extends Logging { // NB: blockManager is not valid until initialize() is called later. val blockManager = new BlockManager(executorId, rpcEnv, blockManagerMaster, - serializer, conf, mapOutputTracker, shuffleManager, blockTransferService, securityManager, - numUsableCores) + serializer, conf, memoryManager, mapOutputTracker, shuffleManager, + blockTransferService, securityManager, numUsableCores) val broadcastManager = new BroadcastManager(isDriver, conf, securityManager) @@ -417,6 +421,7 @@ object SparkEnv extends Logging { httpFileServer, sparkFilesDir, metricsSystem, + memoryManager, shuffleMemoryManager, executorMemoryManager, outputCommitCoordinator, diff --git a/core/src/main/scala/org/apache/spark/memory/MemoryManager.scala b/core/src/main/scala/org/apache/spark/memory/MemoryManager.scala new file mode 100644 index 0000000000000..4bf73b696920d --- /dev/null +++ b/core/src/main/scala/org/apache/spark/memory/MemoryManager.scala @@ -0,0 +1,117 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.memory + +import scala.collection.mutable + +import org.apache.spark.storage.{BlockId, BlockStatus, MemoryStore} + + +/** + * An abstract memory manager that enforces how memory is shared between execution and storage. + * + * In this context, execution memory refers to that used for computation in shuffles, joins, + * sorts and aggregations, while storage memory refers to that used for caching and propagating + * internal data across the cluster. There exists one of these per JVM. + */ +private[spark] abstract class MemoryManager { + + // The memory store used to evict cached blocks + private var _memoryStore: MemoryStore = _ + protected def memoryStore: MemoryStore = { + if (_memoryStore == null) { + throw new IllegalArgumentException("memory store not initialized yet") + } + _memoryStore + } + + /** + * Set the [[MemoryStore]] used by this manager to evict cached blocks. + * This must be set after construction due to initialization ordering constraints. + */ + def setMemoryStore(store: MemoryStore): Unit = { + _memoryStore = store + } + + /** + * Acquire N bytes of memory for execution. + * @return number of bytes successfully granted (<= N). 
+ */ + def acquireExecutionMemory(numBytes: Long): Long + + /** + * Acquire N bytes of memory to cache the given block, evicting existing ones if necessary. + * Blocks evicted in the process, if any, are added to `evictedBlocks`. + * @return whether all N bytes were successfully granted. + */ + def acquireStorageMemory( + blockId: BlockId, + numBytes: Long, + evictedBlocks: mutable.Buffer[(BlockId, BlockStatus)]): Boolean + + /** + * Acquire N bytes of memory to unroll the given block, evicting existing ones if necessary. + * Blocks evicted in the process, if any, are added to `evictedBlocks`. + * @return whether all N bytes were successfully granted. + */ + def acquireUnrollMemory( + blockId: BlockId, + numBytes: Long, + evictedBlocks: mutable.Buffer[(BlockId, BlockStatus)]): Boolean + + /** + * Release N bytes of execution memory. + */ + def releaseExecutionMemory(numBytes: Long): Unit + + /** + * Release N bytes of storage memory. + */ + def releaseStorageMemory(numBytes: Long): Unit + + /** + * Release all storage memory acquired. + */ + def releaseStorageMemory(): Unit + + /** + * Release N bytes of unroll memory. + */ + def releaseUnrollMemory(numBytes: Long): Unit + + /** + * Total available memory for execution, in bytes. + */ + def maxExecutionMemory: Long + + /** + * Total available memory for storage, in bytes. + */ + def maxStorageMemory: Long + + /** + * Execution memory currently in use, in bytes. + */ + def executionMemoryUsed: Long + + /** + * Storage memory currently in use, in bytes. + */ + def storageMemoryUsed: Long + +} diff --git a/core/src/main/scala/org/apache/spark/memory/StaticMemoryManager.scala b/core/src/main/scala/org/apache/spark/memory/StaticMemoryManager.scala new file mode 100644 index 0000000000000..150445edb9578 --- /dev/null +++ b/core/src/main/scala/org/apache/spark/memory/StaticMemoryManager.scala @@ -0,0 +1,202 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.memory + +import scala.collection.mutable + +import org.apache.spark.{Logging, SparkConf} +import org.apache.spark.storage.{BlockId, BlockStatus} + + +/** + * A [[MemoryManager]] that statically partitions the heap space into disjoint regions. + * + * The sizes of the execution and storage regions are determined through + * `spark.shuffle.memoryFraction` and `spark.storage.memoryFraction` respectively. The two + * regions are cleanly separated such that neither usage can borrow memory from the other. 
+ */ +private[spark] class StaticMemoryManager( + conf: SparkConf, + override val maxExecutionMemory: Long, + override val maxStorageMemory: Long) + extends MemoryManager with Logging { + + // Max number of bytes worth of blocks to evict when unrolling + private val maxMemoryToEvictForUnroll: Long = { + (maxStorageMemory * conf.getDouble("spark.storage.unrollFraction", 0.2)).toLong + } + + // Amount of execution / storage memory in use + // Accesses must be synchronized on `this` + private var _executionMemoryUsed: Long = 0 + private var _storageMemoryUsed: Long = 0 + + def this(conf: SparkConf) { + this( + conf, + StaticMemoryManager.getMaxExecutionMemory(conf), + StaticMemoryManager.getMaxStorageMemory(conf)) + } + + /** + * Acquire N bytes of memory for execution. + * @return number of bytes successfully granted (<= N). + */ + override def acquireExecutionMemory(numBytes: Long): Long = synchronized { + assert(_executionMemoryUsed <= maxExecutionMemory) + val bytesToGrant = math.min(numBytes, maxExecutionMemory - _executionMemoryUsed) + _executionMemoryUsed += bytesToGrant + bytesToGrant + } + + /** + * Acquire N bytes of memory to cache the given block, evicting existing ones if necessary. + * Blocks evicted in the process, if any, are added to `evictedBlocks`. + * @return whether all N bytes were successfully granted. + */ + override def acquireStorageMemory( + blockId: BlockId, + numBytes: Long, + evictedBlocks: mutable.Buffer[(BlockId, BlockStatus)]): Boolean = { + acquireStorageMemory(blockId, numBytes, numBytes, evictedBlocks) + } + + /** + * Acquire N bytes of memory to unroll the given block, evicting existing ones if necessary. + * + * This evicts at most M bytes worth of existing blocks, where M is a fraction of the storage + * space specified by `spark.storage.unrollFraction`. Blocks evicted in the process, if any, + * are added to `evictedBlocks`. + * + * @return whether all N bytes were successfully granted. + */ + override def acquireUnrollMemory( + blockId: BlockId, + numBytes: Long, + evictedBlocks: mutable.Buffer[(BlockId, BlockStatus)]): Boolean = { + val currentUnrollMemory = memoryStore.currentUnrollMemory + val maxNumBytesToFree = math.max(0, maxMemoryToEvictForUnroll - currentUnrollMemory) + val numBytesToFree = math.min(numBytes, maxNumBytesToFree) + acquireStorageMemory(blockId, numBytes, numBytesToFree, evictedBlocks) + } + + /** + * Acquire N bytes of storage memory for the given block, evicting existing ones if necessary. + * + * @param blockId the ID of the block we are acquiring storage memory for + * @param numBytesToAcquire the size of this block + * @param numBytesToFree the size of space to be freed through evicting blocks + * @param evictedBlocks a holder for blocks evicted in the process + * @return whether all N bytes were successfully granted. + */ + private def acquireStorageMemory( + blockId: BlockId, + numBytesToAcquire: Long, + numBytesToFree: Long, + evictedBlocks: mutable.Buffer[(BlockId, BlockStatus)]): Boolean = { + // Note: Keep this outside synchronized block to avoid potential deadlocks! + memoryStore.ensureFreeSpace(blockId, numBytesToFree, evictedBlocks) + synchronized { + assert(_storageMemoryUsed <= maxStorageMemory) + val enoughMemory = _storageMemoryUsed + numBytesToAcquire <= maxStorageMemory + if (enoughMemory) { + _storageMemoryUsed += numBytesToAcquire + } + enoughMemory + } + } + + /** + * Release N bytes of execution memory. 
+ */ + override def releaseExecutionMemory(numBytes: Long): Unit = synchronized { + if (numBytes > _executionMemoryUsed) { + logWarning(s"Attempted to release $numBytes bytes of execution " + + s"memory when we only have ${_executionMemoryUsed} bytes") + _executionMemoryUsed = 0 + } else { + _executionMemoryUsed -= numBytes + } + } + + /** + * Release N bytes of storage memory. + */ + override def releaseStorageMemory(numBytes: Long): Unit = synchronized { + if (numBytes > _storageMemoryUsed) { + logWarning(s"Attempted to release $numBytes bytes of storage " + + s"memory when we only have ${_storageMemoryUsed} bytes") + _storageMemoryUsed = 0 + } else { + _storageMemoryUsed -= numBytes + } + } + + /** + * Release all storage memory acquired. + */ + override def releaseStorageMemory(): Unit = synchronized { + _storageMemoryUsed = 0 + } + + /** + * Release N bytes of unroll memory. + */ + override def releaseUnrollMemory(numBytes: Long): Unit = { + releaseStorageMemory(numBytes) + } + + /** + * Amount of execution memory currently in use, in bytes. + */ + override def executionMemoryUsed: Long = synchronized { + _executionMemoryUsed + } + + /** + * Amount of storage memory currently in use, in bytes. + */ + override def storageMemoryUsed: Long = synchronized { + _storageMemoryUsed + } + +} + + +private[spark] object StaticMemoryManager { + + /** + * Return the total amount of memory available for the storage region, in bytes. + */ + private def getMaxStorageMemory(conf: SparkConf): Long = { + val memoryFraction = conf.getDouble("spark.storage.memoryFraction", 0.6) + val safetyFraction = conf.getDouble("spark.storage.safetyFraction", 0.9) + (Runtime.getRuntime.maxMemory * memoryFraction * safetyFraction).toLong + } + + + /** + * Return the total amount of memory available for the execution region, in bytes. + */ + private def getMaxExecutionMemory(conf: SparkConf): Long = { + val memoryFraction = conf.getDouble("spark.shuffle.memoryFraction", 0.2) + val safetyFraction = conf.getDouble("spark.shuffle.safetyFraction", 0.8) + (Runtime.getRuntime.maxMemory * memoryFraction * safetyFraction).toLong + } + +} diff --git a/core/src/main/scala/org/apache/spark/shuffle/ShuffleMemoryManager.scala b/core/src/main/scala/org/apache/spark/shuffle/ShuffleMemoryManager.scala index 9839c7640cc63..bb64bb3f35df0 100644 --- a/core/src/main/scala/org/apache/spark/shuffle/ShuffleMemoryManager.scala +++ b/core/src/main/scala/org/apache/spark/shuffle/ShuffleMemoryManager.scala @@ -21,8 +21,9 @@ import scala.collection.mutable import com.google.common.annotations.VisibleForTesting +import org.apache.spark._ +import org.apache.spark.memory.{StaticMemoryManager, MemoryManager} import org.apache.spark.unsafe.array.ByteArrayMethods -import org.apache.spark.{Logging, SparkException, SparkConf, TaskContext} /** * Allocates a pool of memory to tasks for use in shuffle operations. Each disk-spilling @@ -40,16 +41,17 @@ import org.apache.spark.{Logging, SparkException, SparkConf, TaskContext} * * Use `ShuffleMemoryManager.create()` factory method to create a new instance. * - * @param maxMemory total amount of memory available for execution, in bytes. + * @param memoryManager the interface through which this manager acquires execution memory * @param pageSizeBytes number of bytes for each page, by default. 
*/ private[spark] class ShuffleMemoryManager protected ( - val maxMemory: Long, + memoryManager: MemoryManager, val pageSizeBytes: Long) extends Logging { private val taskMemory = new mutable.HashMap[Long, Long]() // taskAttemptId -> memory bytes + private val maxMemory = memoryManager.maxExecutionMemory private def currentTaskAttemptId(): Long = { // In case this is called on the driver, return an invalid task attempt id. @@ -71,7 +73,7 @@ class ShuffleMemoryManager protected ( // of active tasks, to let other tasks ramp down their memory in calls to tryToAcquire if (!taskMemory.contains(taskAttemptId)) { taskMemory(taskAttemptId) = 0L - notifyAll() // Will later cause waiting tasks to wake up and check numThreads again + notifyAll() // Will later cause waiting tasks to wake up and check numTasks again } // Keep looping until we're either sure that we don't want to grant this request (because this @@ -85,46 +87,57 @@ class ShuffleMemoryManager protected ( // How much we can grant this task; don't let it grow to more than 1 / numActiveTasks; // don't let it be negative val maxToGrant = math.min(numBytes, math.max(0, (maxMemory / numActiveTasks) - curMem)) + // Only give it as much memory as is free, which might be none if it reached 1 / numTasks + val toGrant = math.min(maxToGrant, freeMemory) if (curMem < maxMemory / (2 * numActiveTasks)) { // We want to let each task get at least 1 / (2 * numActiveTasks) before blocking; // if we can't give it this much now, wait for other tasks to free up memory // (this happens if older tasks allocated lots of memory before N grew) if (freeMemory >= math.min(maxToGrant, maxMemory / (2 * numActiveTasks) - curMem)) { - val toGrant = math.min(maxToGrant, freeMemory) - taskMemory(taskAttemptId) += toGrant - return toGrant + return acquire(toGrant) } else { logInfo( s"TID $taskAttemptId waiting for at least 1/2N of shuffle memory pool to be free") wait() } } else { - // Only give it as much memory as is free, which might be none if it reached 1 / numThreads - val toGrant = math.min(maxToGrant, freeMemory) - taskMemory(taskAttemptId) += toGrant - return toGrant + return acquire(toGrant) } } 0L // Never reached } + /** + * Acquire N bytes of execution memory from the memory manager for the current task. + * @return number of bytes actually acquired (<= N). + */ + private def acquire(numBytes: Long): Long = synchronized { + val taskAttemptId = currentTaskAttemptId() + val acquired = memoryManager.acquireExecutionMemory(numBytes) + taskMemory(taskAttemptId) += acquired + acquired + } + /** Release numBytes bytes for the current task. */ def release(numBytes: Long): Unit = synchronized { val taskAttemptId = currentTaskAttemptId() val curMem = taskMemory.getOrElse(taskAttemptId, 0L) if (curMem < numBytes) { throw new SparkException( - s"Internal error: release called on ${numBytes} bytes but task only has ${curMem}") + s"Internal error: release called on $numBytes bytes but task only has $curMem") } taskMemory(taskAttemptId) -= numBytes + memoryManager.releaseExecutionMemory(numBytes) notifyAll() // Notify waiters who locked "this" in tryToAcquire that memory has been freed } /** Release all memory for the current task and mark it as inactive (e.g. when a task ends). 
*/ def releaseMemoryForThisTask(): Unit = synchronized { val taskAttemptId = currentTaskAttemptId() - taskMemory.remove(taskAttemptId) + taskMemory.remove(taskAttemptId).foreach { numBytes => + memoryManager.releaseExecutionMemory(numBytes) + } notifyAll() // Notify waiters who locked "this" in tryToAcquire that memory has been freed } @@ -138,30 +151,28 @@ class ShuffleMemoryManager protected ( private[spark] object ShuffleMemoryManager { - def create(conf: SparkConf, numCores: Int): ShuffleMemoryManager = { - val maxMemory = ShuffleMemoryManager.getMaxMemory(conf) + def create( + conf: SparkConf, + memoryManager: MemoryManager, + numCores: Int): ShuffleMemoryManager = { + val maxMemory = memoryManager.maxExecutionMemory val pageSize = ShuffleMemoryManager.getPageSize(conf, maxMemory, numCores) - new ShuffleMemoryManager(maxMemory, pageSize) + new ShuffleMemoryManager(memoryManager, pageSize) } + /** + * Create a dummy [[ShuffleMemoryManager]] with the specified capacity and page size. + */ def create(maxMemory: Long, pageSizeBytes: Long): ShuffleMemoryManager = { - new ShuffleMemoryManager(maxMemory, pageSizeBytes) + val conf = new SparkConf + val memoryManager = new StaticMemoryManager( + conf, maxExecutionMemory = maxMemory, maxStorageMemory = Long.MaxValue) + new ShuffleMemoryManager(memoryManager, pageSizeBytes) } @VisibleForTesting def createForTesting(maxMemory: Long): ShuffleMemoryManager = { - new ShuffleMemoryManager(maxMemory, 4 * 1024 * 1024) - } - - /** - * Figure out the shuffle memory limit from a SparkConf. We currently have both a fraction - * of the memory pool and a safety factor since collections can sometimes grow bigger than - * the size we target before we estimate their sizes again. - */ - private def getMaxMemory(conf: SparkConf): Long = { - val memoryFraction = conf.getDouble("spark.shuffle.memoryFraction", 0.2) - val safetyFraction = conf.getDouble("spark.shuffle.safetyFraction", 0.8) - (Runtime.getRuntime.maxMemory * memoryFraction * safetyFraction).toLong + create(maxMemory, 4 * 1024 * 1024) } /** diff --git a/core/src/main/scala/org/apache/spark/storage/BlockManager.scala b/core/src/main/scala/org/apache/spark/storage/BlockManager.scala index 47bd2ef8b2941..9f5bd2abbdc5d 100644 --- a/core/src/main/scala/org/apache/spark/storage/BlockManager.scala +++ b/core/src/main/scala/org/apache/spark/storage/BlockManager.scala @@ -31,6 +31,7 @@ import sun.nio.ch.DirectBuffer import org.apache.spark._ import org.apache.spark.executor.{DataReadMethod, ShuffleWriteMetrics} import org.apache.spark.io.CompressionCodec +import org.apache.spark.memory.MemoryManager import org.apache.spark.network._ import org.apache.spark.network.buffer.{ManagedBuffer, NioManagedBuffer} import org.apache.spark.network.netty.SparkTransportConf @@ -64,8 +65,8 @@ private[spark] class BlockManager( rpcEnv: RpcEnv, val master: BlockManagerMaster, defaultSerializer: Serializer, - maxMemory: Long, val conf: SparkConf, + memoryManager: MemoryManager, mapOutputTracker: MapOutputTracker, shuffleManager: ShuffleManager, blockTransferService: BlockTransferService, @@ -82,12 +83,15 @@ private[spark] class BlockManager( // Actual storage of where blocks are kept private var externalBlockStoreInitialized = false - private[spark] val memoryStore = new MemoryStore(this, maxMemory) + private[spark] val memoryStore = new MemoryStore(this, memoryManager) private[spark] val diskStore = new DiskStore(this, diskBlockManager) private[spark] lazy val externalBlockStore: ExternalBlockStore = { 
externalBlockStoreInitialized = true new ExternalBlockStore(this, executorId) } + memoryManager.setMemoryStore(memoryStore) + + private val maxMemory = memoryManager.maxStorageMemory private[spark] val externalShuffleServiceEnabled = conf.getBoolean("spark.shuffle.service.enabled", false) @@ -157,24 +161,6 @@ private[spark] class BlockManager( * loaded yet. */ private lazy val compressionCodec: CompressionCodec = CompressionCodec.createCodec(conf) - /** - * Construct a BlockManager with a memory limit set based on system properties. - */ - def this( - execId: String, - rpcEnv: RpcEnv, - master: BlockManagerMaster, - serializer: Serializer, - conf: SparkConf, - mapOutputTracker: MapOutputTracker, - shuffleManager: ShuffleManager, - blockTransferService: BlockTransferService, - securityManager: SecurityManager, - numUsableCores: Int) = { - this(execId, rpcEnv, master, serializer, BlockManager.getMaxMemory(conf), - conf, mapOutputTracker, shuffleManager, blockTransferService, securityManager, numUsableCores) - } - /** * Initializes the BlockManager with the given appId. This is not performed in the constructor as * the appId may not be known at BlockManager instantiation time (in particular for the driver, @@ -1267,13 +1253,6 @@ private[spark] class BlockManager( private[spark] object BlockManager extends Logging { private val ID_GENERATOR = new IdGenerator - /** Return the total amount of storage memory available. */ - private def getMaxMemory(conf: SparkConf): Long = { - val memoryFraction = conf.getDouble("spark.storage.memoryFraction", 0.6) - val safetyFraction = conf.getDouble("spark.storage.safetyFraction", 0.9) - (Runtime.getRuntime.maxMemory * memoryFraction * safetyFraction).toLong - } - /** * Attempt to clean up a ByteBuffer if it is memory-mapped. This uses an *unsafe* Sun API that * might cause errors if one attempts to read from the unmapped buffer, but it's better than diff --git a/core/src/main/scala/org/apache/spark/storage/MemoryStore.scala b/core/src/main/scala/org/apache/spark/storage/MemoryStore.scala index 6f27f00307f8c..35c57b923c43a 100644 --- a/core/src/main/scala/org/apache/spark/storage/MemoryStore.scala +++ b/core/src/main/scala/org/apache/spark/storage/MemoryStore.scala @@ -24,6 +24,7 @@ import scala.collection.mutable import scala.collection.mutable.ArrayBuffer import org.apache.spark.TaskContext +import org.apache.spark.memory.MemoryManager import org.apache.spark.util.{SizeEstimator, Utils} import org.apache.spark.util.collection.SizeTrackingVector @@ -33,13 +34,12 @@ private case class MemoryEntry(value: Any, size: Long, deserialized: Boolean) * Stores blocks in memory, either as Arrays of deserialized Java objects or as * serialized ByteBuffers. */ -private[spark] class MemoryStore(blockManager: BlockManager, maxMemory: Long) +private[spark] class MemoryStore(blockManager: BlockManager, memoryManager: MemoryManager) extends BlockStore(blockManager) { private val conf = blockManager.conf private val entries = new LinkedHashMap[BlockId, MemoryEntry](32, 0.75f, true) - - @volatile private var currentMemory = 0L + private val maxMemory = memoryManager.maxStorageMemory // Ensure only one thread is putting, and if necessary, dropping blocks at any given time private val accountingLock = new Object @@ -56,15 +56,6 @@ private[spark] class MemoryStore(blockManager: BlockManager, maxMemory: Long) // memory (SPARK-4777). 
private val pendingUnrollMemoryMap = mutable.HashMap[Long, Long]() - /** - * The amount of space ensured for unrolling values in memory, shared across all cores. - * This space is not reserved in advance, but allocated dynamically by dropping existing blocks. - */ - private val maxUnrollMemory: Long = { - val unrollFraction = conf.getDouble("spark.storage.unrollFraction", 0.2) - (maxMemory * unrollFraction).toLong - } - // Initial memory to request before unrolling any block private val unrollMemoryThreshold: Long = conf.getLong("spark.storage.unrollMemoryThreshold", 1024 * 1024) @@ -77,8 +68,14 @@ private[spark] class MemoryStore(blockManager: BlockManager, maxMemory: Long) logInfo("MemoryStore started with capacity %s".format(Utils.bytesToString(maxMemory))) - /** Free memory not occupied by existing blocks. Note that this does not include unroll memory. */ - def freeMemory: Long = maxMemory - currentMemory + /** Total storage memory used including unroll memory, in bytes. */ + private def memoryUsed: Long = memoryManager.storageMemoryUsed + + /** + * Amount of storage memory, in bytes, used for caching blocks. + * This does not include memory used for unrolling. + */ + private def blocksMemoryUsed: Long = memoryUsed - currentUnrollMemory override def getSize(blockId: BlockId): Long = { entries.synchronized { @@ -94,8 +91,9 @@ private[spark] class MemoryStore(blockManager: BlockManager, maxMemory: Long) val values = blockManager.dataDeserialize(blockId, bytes) putIterator(blockId, values, level, returnValues = true) } else { - val putAttempt = tryToPut(blockId, bytes, bytes.limit, deserialized = false) - PutResult(bytes.limit(), Right(bytes.duplicate()), putAttempt.droppedBlocks) + val droppedBlocks = new ArrayBuffer[(BlockId, BlockStatus)] + tryToPut(blockId, bytes, bytes.limit, deserialized = false, droppedBlocks) + PutResult(bytes.limit(), Right(bytes.duplicate()), droppedBlocks) } } @@ -108,15 +106,16 @@ private[spark] class MemoryStore(blockManager: BlockManager, maxMemory: Long) def putBytes(blockId: BlockId, size: Long, _bytes: () => ByteBuffer): PutResult = { // Work on a duplicate - since the original input might be used elsewhere. 
lazy val bytes = _bytes().duplicate().rewind().asInstanceOf[ByteBuffer] - val putAttempt = tryToPut(blockId, () => bytes, size, deserialized = false) + val droppedBlocks = new ArrayBuffer[(BlockId, BlockStatus)] + val putSuccess = tryToPut(blockId, () => bytes, size, deserialized = false, droppedBlocks) val data = - if (putAttempt.success) { + if (putSuccess) { assert(bytes.limit == size) Right(bytes.duplicate()) } else { null } - PutResult(size, data, putAttempt.droppedBlocks) + PutResult(size, data, droppedBlocks) } override def putArray( @@ -124,14 +123,15 @@ private[spark] class MemoryStore(blockManager: BlockManager, maxMemory: Long) values: Array[Any], level: StorageLevel, returnValues: Boolean): PutResult = { + val droppedBlocks = new ArrayBuffer[(BlockId, BlockStatus)] if (level.deserialized) { val sizeEstimate = SizeEstimator.estimate(values.asInstanceOf[AnyRef]) - val putAttempt = tryToPut(blockId, values, sizeEstimate, deserialized = true) - PutResult(sizeEstimate, Left(values.iterator), putAttempt.droppedBlocks) + tryToPut(blockId, values, sizeEstimate, deserialized = true, droppedBlocks) + PutResult(sizeEstimate, Left(values.iterator), droppedBlocks) } else { val bytes = blockManager.dataSerialize(blockId, values.iterator) - val putAttempt = tryToPut(blockId, bytes, bytes.limit, deserialized = false) - PutResult(bytes.limit(), Right(bytes.duplicate()), putAttempt.droppedBlocks) + tryToPut(blockId, bytes, bytes.limit, deserialized = false, droppedBlocks) + PutResult(bytes.limit(), Right(bytes.duplicate()), droppedBlocks) } } @@ -209,23 +209,22 @@ private[spark] class MemoryStore(blockManager: BlockManager, maxMemory: Long) } override def remove(blockId: BlockId): Boolean = { - entries.synchronized { - val entry = entries.remove(blockId) - if (entry != null) { - currentMemory -= entry.size - logDebug(s"Block $blockId of size ${entry.size} dropped from memory (free $freeMemory)") - true - } else { - false - } + val entry = entries.synchronized { entries.remove(blockId) } + if (entry != null) { + memoryManager.releaseStorageMemory(entry.size) + logDebug(s"Block $blockId of size ${entry.size} dropped " + + s"from memory (free ${maxMemory - blocksMemoryUsed})") + true + } else { + false } } override def clear() { entries.synchronized { entries.clear() - currentMemory = 0 } + memoryManager.releaseStorageMemory() logInfo("MemoryStore cleared") } @@ -265,7 +264,7 @@ private[spark] class MemoryStore(blockManager: BlockManager, maxMemory: Long) var vector = new SizeTrackingVector[Any] // Request enough memory to begin unrolling - keepUnrolling = reserveUnrollMemoryForThisTask(initialMemoryThreshold) + keepUnrolling = reserveUnrollMemoryForThisTask(blockId, initialMemoryThreshold, droppedBlocks) if (!keepUnrolling) { logWarning(s"Failed to reserve initial memory threshold of " + @@ -281,20 +280,8 @@ private[spark] class MemoryStore(blockManager: BlockManager, maxMemory: Long) val currentSize = vector.estimateSize() if (currentSize >= memoryThreshold) { val amountToRequest = (currentSize * memoryGrowthFactor - memoryThreshold).toLong - // Hold the accounting lock, in case another thread concurrently puts a block that - // takes up the unrolling space we just ensured here - accountingLock.synchronized { - if (!reserveUnrollMemoryForThisTask(amountToRequest)) { - // If the first request is not granted, try again after ensuring free space - // If there is still not enough space, give up and drop the partition - val spaceToEnsure = maxUnrollMemory - currentUnrollMemory - if (spaceToEnsure > 0) 
{ - val result = ensureFreeSpace(blockId, spaceToEnsure) - droppedBlocks ++= result.droppedBlocks - } - keepUnrolling = reserveUnrollMemoryForThisTask(amountToRequest) - } - } + keepUnrolling = reserveUnrollMemoryForThisTask( + blockId, amountToRequest, droppedBlocks) // New threshold is currentSize * memoryGrowthFactor memoryThreshold += amountToRequest } @@ -317,10 +304,16 @@ private[spark] class MemoryStore(blockManager: BlockManager, maxMemory: Long) // Otherwise, if we return an iterator, we release the memory reserved here // later when the task finishes. if (keepUnrolling) { + val taskAttemptId = currentTaskAttemptId() accountingLock.synchronized { - val amountToRelease = currentUnrollMemoryForThisTask - previousMemoryReserved - releaseUnrollMemoryForThisTask(amountToRelease) - reservePendingUnrollMemoryForThisTask(amountToRelease) + // Here, we transfer memory from unroll to pending unroll because we expect to cache this + // block in `tryToPut`. We do not release and re-acquire memory from the MemoryManager in + // order to avoid race conditions where another component steals the memory that we're + // trying to transfer. + val amountToTransferToPending = currentUnrollMemoryForThisTask - previousMemoryReserved + unrollMemoryMap(taskAttemptId) -= amountToTransferToPending + pendingUnrollMemoryMap(taskAttemptId) = + pendingUnrollMemoryMap.getOrElse(taskAttemptId, 0L) + amountToTransferToPending } } } @@ -337,8 +330,9 @@ private[spark] class MemoryStore(blockManager: BlockManager, maxMemory: Long) blockId: BlockId, value: Any, size: Long, - deserialized: Boolean): ResultWithDroppedBlocks = { - tryToPut(blockId, () => value, size, deserialized) + deserialized: Boolean, + droppedBlocks: mutable.Buffer[(BlockId, BlockStatus)]): Boolean = { + tryToPut(blockId, () => value, size, deserialized, droppedBlocks) } /** @@ -354,13 +348,16 @@ private[spark] class MemoryStore(blockManager: BlockManager, maxMemory: Long) * blocks to free memory for one block, another thread may use up the freed space for * another block. * - * Return whether put was successful, along with the blocks dropped in the process. + * All blocks evicted in the process, if any, will be added to `droppedBlocks`. + * + * @return whether put was successful. */ private def tryToPut( blockId: BlockId, value: () => Any, size: Long, - deserialized: Boolean): ResultWithDroppedBlocks = { + deserialized: Boolean, + droppedBlocks: mutable.Buffer[(BlockId, BlockStatus)]): Boolean = { /* TODO: Its possible to optimize the locking by locking entries only when selecting blocks * to be dropped. Once the to-be-dropped blocks have been selected, and lock on entries has @@ -368,24 +365,27 @@ private[spark] class MemoryStore(blockManager: BlockManager, maxMemory: Long) * for freeing up more space for another block that needs to be put. Only then the actually * dropping of blocks (and writing to disk if necessary) can proceed in parallel. */ - var putSuccess = false - val droppedBlocks = new ArrayBuffer[(BlockId, BlockStatus)] - accountingLock.synchronized { - val freeSpaceResult = ensureFreeSpace(blockId, size) - val enoughFreeSpace = freeSpaceResult.success - droppedBlocks ++= freeSpaceResult.droppedBlocks - - if (enoughFreeSpace) { + // Note: if we have previously unrolled this block successfully, then pending unroll + // memory should be non-zero. This is the amount that we already reserved during the + // unrolling process. In this case, we can just reuse this space to cache our block. 
+ // + // Note: the StaticMemoryManager counts unroll memory as storage memory. Here, the + // synchronization on `accountingLock` guarantees that the release of unroll memory and + // acquisition of storage memory happens atomically. However, if storage memory is acquired + // outside of MemoryStore or if unroll memory is counted as execution memory, then we will + // have to revisit this assumption. See SPARK-10983 for more context. + releasePendingUnrollMemoryForThisTask() + val enoughMemory = memoryManager.acquireStorageMemory(blockId, size, droppedBlocks) + if (enoughMemory) { + // We acquired enough memory for the block, so go ahead and put it val entry = new MemoryEntry(value(), size, deserialized) entries.synchronized { entries.put(blockId, entry) - currentMemory += size } val valuesOrBytes = if (deserialized) "values" else "bytes" logInfo("Block %s stored as %s in memory (estimated size %s, free %s)".format( - blockId, valuesOrBytes, Utils.bytesToString(size), Utils.bytesToString(freeMemory))) - putSuccess = true + blockId, valuesOrBytes, Utils.bytesToString(size), Utils.bytesToString(blocksMemoryUsed))) } else { // Tell the block manager that we couldn't put it in memory so that it can drop it to // disk if the block allows disk storage. @@ -397,10 +397,8 @@ private[spark] class MemoryStore(blockManager: BlockManager, maxMemory: Long) val droppedBlockStatus = blockManager.dropFromMemory(blockId, () => data) droppedBlockStatus.foreach { status => droppedBlocks += ((blockId, status)) } } - // Release the unroll memory used because we no longer need the underlying Array - releasePendingUnrollMemoryForThisTask() + enoughMemory } - ResultWithDroppedBlocks(putSuccess, droppedBlocks) } /** @@ -409,40 +407,42 @@ private[spark] class MemoryStore(blockManager: BlockManager, maxMemory: Long) * from the same RDD (which leads to a wasteful cyclic replacement pattern for RDDs that * don't fit into memory that we want to avoid). * - * Assume that `accountingLock` is held by the caller to ensure only one thread is dropping - * blocks. Otherwise, the freed space may fill up before the caller puts in their new value. - * - * Return whether there is enough free space, along with the blocks dropped in the process. + * @param blockId the ID of the block we are freeing space for + * @param space the size of this block + * @param droppedBlocks a holder for blocks evicted in the process + * @return whether there is enough free space. */ - private def ensureFreeSpace( - blockIdToAdd: BlockId, - space: Long): ResultWithDroppedBlocks = { - logInfo(s"ensureFreeSpace($space) called with curMem=$currentMemory, maxMem=$maxMemory") - - val droppedBlocks = new ArrayBuffer[(BlockId, BlockStatus)] + private[spark] def ensureFreeSpace( + blockId: BlockId, + space: Long, + droppedBlocks: mutable.Buffer[(BlockId, BlockStatus)]): Boolean = { + accountingLock.synchronized { + val freeMemory = maxMemory - memoryUsed + val rddToAdd = getRddId(blockId) + val selectedBlocks = new ArrayBuffer[BlockId] + var selectedMemory = 0L - if (space > maxMemory) { - logInfo(s"Will not store $blockIdToAdd as it is larger than our memory limit") - return ResultWithDroppedBlocks(success = false, droppedBlocks) - } + logInfo(s"Ensuring $space bytes of free space for block $blockId " + + s"(free: $freeMemory, max: $maxMemory)") - // Take into account the amount of memory currently occupied by unrolling blocks - // and minus the pending unroll memory for that block on current thread. 
- val taskAttemptId = currentTaskAttemptId() - val actualFreeMemory = freeMemory - currentUnrollMemory + - pendingUnrollMemoryMap.getOrElse(taskAttemptId, 0L) + // Fail fast if the block simply won't fit + if (space > maxMemory) { + logInfo(s"Will not store $blockId as the required space " + + s"($space bytes) than our memory limit ($maxMemory bytes)") + return false + } - if (actualFreeMemory < space) { - val rddToAdd = getRddId(blockIdToAdd) - val selectedBlocks = new ArrayBuffer[BlockId] - var selectedMemory = 0L + // No need to evict anything if there is already enough free space + if (freeMemory >= space) { + return true + } // This is synchronized to ensure that the set of entries is not changed // (because of getValue or getBytes) while traversing the iterator, as that // can lead to exceptions. entries.synchronized { val iterator = entries.entrySet().iterator() - while (actualFreeMemory + selectedMemory < space && iterator.hasNext) { + while (freeMemory + selectedMemory < space && iterator.hasNext) { val pair = iterator.next() val blockId = pair.getKey if (rddToAdd.isEmpty || rddToAdd != getRddId(blockId)) { @@ -452,7 +452,7 @@ private[spark] class MemoryStore(blockManager: BlockManager, maxMemory: Long) } } - if (actualFreeMemory + selectedMemory >= space) { + if (freeMemory + selectedMemory >= space) { logInfo(s"${selectedBlocks.size} blocks selected for dropping") for (blockId <- selectedBlocks) { val entry = entries.synchronized { entries.get(blockId) } @@ -469,14 +469,13 @@ private[spark] class MemoryStore(blockManager: BlockManager, maxMemory: Long) droppedBlockStatus.foreach { status => droppedBlocks += ((blockId, status)) } } } - return ResultWithDroppedBlocks(success = true, droppedBlocks) + true } else { - logInfo(s"Will not store $blockIdToAdd as it would require dropping another block " + + logInfo(s"Will not store $blockId as it would require dropping another block " + "from the same RDD") - return ResultWithDroppedBlocks(success = false, droppedBlocks) + false } } - ResultWithDroppedBlocks(success = true, droppedBlocks) } override def contains(blockId: BlockId): Boolean = { @@ -489,17 +488,21 @@ private[spark] class MemoryStore(blockManager: BlockManager, maxMemory: Long) } /** - * Reserve additional memory for unrolling blocks used by this task. - * Return whether the request is granted. + * Reserve memory for unrolling the given block for this task. + * @return whether the request is granted. */ - def reserveUnrollMemoryForThisTask(memory: Long): Boolean = { + def reserveUnrollMemoryForThisTask( + blockId: BlockId, + memory: Long, + droppedBlocks: mutable.Buffer[(BlockId, BlockStatus)]): Boolean = { accountingLock.synchronized { - val granted = freeMemory > currentUnrollMemory + memory - if (granted) { + // Note: all acquisitions of unroll memory must be synchronized on `accountingLock` + val success = memoryManager.acquireUnrollMemory(blockId, memory, droppedBlocks) + if (success) { val taskAttemptId = currentTaskAttemptId() unrollMemoryMap(taskAttemptId) = unrollMemoryMap.getOrElse(taskAttemptId, 0L) + memory } - granted + success } } @@ -507,40 +510,38 @@ private[spark] class MemoryStore(blockManager: BlockManager, maxMemory: Long) * Release memory used by this task for unrolling blocks. * If the amount is not specified, remove the current task's allocation altogether. 
*/ - def releaseUnrollMemoryForThisTask(memory: Long = -1L): Unit = { + def releaseUnrollMemoryForThisTask(memory: Long = Long.MaxValue): Unit = { val taskAttemptId = currentTaskAttemptId() accountingLock.synchronized { - if (memory < 0) { - unrollMemoryMap.remove(taskAttemptId) - } else { - unrollMemoryMap(taskAttemptId) = unrollMemoryMap.getOrElse(taskAttemptId, memory) - memory - // If this task claims no more unroll memory, release it completely - if (unrollMemoryMap(taskAttemptId) <= 0) { - unrollMemoryMap.remove(taskAttemptId) + if (unrollMemoryMap.contains(taskAttemptId)) { + val memoryToRelease = math.min(memory, unrollMemoryMap(taskAttemptId)) + if (memoryToRelease > 0) { + unrollMemoryMap(taskAttemptId) -= memoryToRelease + if (unrollMemoryMap(taskAttemptId) == 0) { + unrollMemoryMap.remove(taskAttemptId) + } + memoryManager.releaseUnrollMemory(memoryToRelease) } } } } - /** - * Reserve the unroll memory of current unroll successful block used by this task - * until actually put the block into memory entry. - */ - def reservePendingUnrollMemoryForThisTask(memory: Long): Unit = { - val taskAttemptId = currentTaskAttemptId() - accountingLock.synchronized { - pendingUnrollMemoryMap(taskAttemptId) = - pendingUnrollMemoryMap.getOrElse(taskAttemptId, 0L) + memory - } - } - /** * Release pending unroll memory of current unroll successful block used by this task */ - def releasePendingUnrollMemoryForThisTask(): Unit = { + def releasePendingUnrollMemoryForThisTask(memory: Long = Long.MaxValue): Unit = { val taskAttemptId = currentTaskAttemptId() accountingLock.synchronized { - pendingUnrollMemoryMap.remove(taskAttemptId) + if (pendingUnrollMemoryMap.contains(taskAttemptId)) { + val memoryToRelease = math.min(memory, pendingUnrollMemoryMap(taskAttemptId)) + if (memoryToRelease > 0) { + pendingUnrollMemoryMap(taskAttemptId) -= memoryToRelease + if (pendingUnrollMemoryMap(taskAttemptId) == 0) { + pendingUnrollMemoryMap.remove(taskAttemptId) + } + memoryManager.releaseUnrollMemory(memoryToRelease) + } + } } } @@ -561,19 +562,16 @@ private[spark] class MemoryStore(blockManager: BlockManager, maxMemory: Long) /** * Return the number of tasks currently unrolling blocks. */ - def numTasksUnrolling: Int = accountingLock.synchronized { unrollMemoryMap.keys.size } + private def numTasksUnrolling: Int = accountingLock.synchronized { unrollMemoryMap.keys.size } /** * Log information about current memory usage. */ - def logMemoryUsage(): Unit = { - val blocksMemory = currentMemory - val unrollMemory = currentUnrollMemory - val totalMemory = blocksMemory + unrollMemory + private def logMemoryUsage(): Unit = { logInfo( - s"Memory use = ${Utils.bytesToString(blocksMemory)} (blocks) + " + - s"${Utils.bytesToString(unrollMemory)} (scratch space shared across " + - s"$numTasksUnrolling tasks(s)) = ${Utils.bytesToString(totalMemory)}. " + + s"Memory use = ${Utils.bytesToString(blocksMemoryUsed)} (blocks) + " + + s"${Utils.bytesToString(currentUnrollMemory)} (scratch space shared across " + + s"$numTasksUnrolling tasks(s)) = ${Utils.bytesToString(memoryUsed)}. " + s"Storage limit = ${Utils.bytesToString(maxMemory)}." ) } @@ -584,7 +582,7 @@ private[spark] class MemoryStore(blockManager: BlockManager, maxMemory: Long) * @param blockId ID of the block we are trying to unroll. * @param finalVectorSize Final size of the vector before unrolling failed. 
*/ - def logUnrollFailureMessage(blockId: BlockId, finalVectorSize: Long): Unit = { + private def logUnrollFailureMessage(blockId: BlockId, finalVectorSize: Long): Unit = { logWarning( s"Not enough space to cache $blockId in memory! " + s"(computed ${Utils.bytesToString(finalVectorSize)} so far)" @@ -592,7 +590,3 @@ private[spark] class MemoryStore(blockManager: BlockManager, maxMemory: Long) logMemoryUsage() } } - -private[spark] case class ResultWithDroppedBlocks( - success: Boolean, - droppedBlocks: Seq[(BlockId, BlockStatus)]) diff --git a/core/src/test/scala/org/apache/spark/memory/StaticMemoryManagerSuite.scala b/core/src/test/scala/org/apache/spark/memory/StaticMemoryManagerSuite.scala new file mode 100644 index 0000000000000..c436a8b5c9f81 --- /dev/null +++ b/core/src/test/scala/org/apache/spark/memory/StaticMemoryManagerSuite.scala @@ -0,0 +1,172 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.memory + +import scala.collection.mutable.ArrayBuffer + +import org.mockito.Mockito.{mock, reset, verify, when} +import org.mockito.Matchers.{any, eq => meq} + +import org.apache.spark.storage.{BlockId, BlockStatus, MemoryStore, TestBlockId} +import org.apache.spark.{SparkConf, SparkFunSuite} + + +class StaticMemoryManagerSuite extends SparkFunSuite { + private val conf = new SparkConf().set("spark.storage.unrollFraction", "0.4") + + test("basic execution memory") { + val maxExecutionMem = 1000L + val (mm, _) = makeThings(maxExecutionMem, Long.MaxValue) + assert(mm.executionMemoryUsed === 0L) + assert(mm.acquireExecutionMemory(10L) === 10L) + assert(mm.executionMemoryUsed === 10L) + assert(mm.acquireExecutionMemory(100L) === 100L) + // Acquire up to the max + assert(mm.acquireExecutionMemory(1000L) === 890L) + assert(mm.executionMemoryUsed === maxExecutionMem) + assert(mm.acquireExecutionMemory(1L) === 0L) + assert(mm.executionMemoryUsed === maxExecutionMem) + mm.releaseExecutionMemory(800L) + assert(mm.executionMemoryUsed === 200L) + // Acquire after release + assert(mm.acquireExecutionMemory(1L) === 1L) + assert(mm.executionMemoryUsed === 201L) + // Release beyond what was acquired + mm.releaseExecutionMemory(maxExecutionMem) + assert(mm.executionMemoryUsed === 0L) + } + + test("basic storage memory") { + val maxStorageMem = 1000L + val dummyBlock = TestBlockId("you can see the world you brought to live") + val evictedBlocks = new ArrayBuffer[(BlockId, BlockStatus)] + val (mm, ms) = makeThings(Long.MaxValue, maxStorageMem) + assert(mm.storageMemoryUsed === 0L) + assert(mm.acquireStorageMemory(dummyBlock, 10L, evictedBlocks)) + // `ensureFreeSpace` should be called with the number of bytes requested + assertEnsureFreeSpaceCalled(ms, dummyBlock, 10L) + assert(mm.storageMemoryUsed === 10L) + 
assert(evictedBlocks.isEmpty) + assert(mm.acquireStorageMemory(dummyBlock, 100L, evictedBlocks)) + assertEnsureFreeSpaceCalled(ms, dummyBlock, 100L) + assert(mm.storageMemoryUsed === 110L) + // Acquire up to the max, not granted + assert(!mm.acquireStorageMemory(dummyBlock, 1000L, evictedBlocks)) + assertEnsureFreeSpaceCalled(ms, dummyBlock, 1000L) + assert(mm.storageMemoryUsed === 110L) + assert(mm.acquireStorageMemory(dummyBlock, 890L, evictedBlocks)) + assertEnsureFreeSpaceCalled(ms, dummyBlock, 890L) + assert(mm.storageMemoryUsed === 1000L) + assert(!mm.acquireStorageMemory(dummyBlock, 1L, evictedBlocks)) + assertEnsureFreeSpaceCalled(ms, dummyBlock, 1L) + assert(mm.storageMemoryUsed === 1000L) + mm.releaseStorageMemory(800L) + assert(mm.storageMemoryUsed === 200L) + // Acquire after release + assert(mm.acquireStorageMemory(dummyBlock, 1L, evictedBlocks)) + assertEnsureFreeSpaceCalled(ms, dummyBlock, 1L) + assert(mm.storageMemoryUsed === 201L) + mm.releaseStorageMemory() + assert(mm.storageMemoryUsed === 0L) + assert(mm.acquireStorageMemory(dummyBlock, 1L, evictedBlocks)) + assertEnsureFreeSpaceCalled(ms, dummyBlock, 1L) + assert(mm.storageMemoryUsed === 1L) + // Release beyond what was acquired + mm.releaseStorageMemory(100L) + assert(mm.storageMemoryUsed === 0L) + } + + test("execution and storage isolation") { + val maxExecutionMem = 200L + val maxStorageMem = 1000L + val dummyBlock = TestBlockId("ain't nobody love like you do") + val dummyBlocks = new ArrayBuffer[(BlockId, BlockStatus)] + val (mm, ms) = makeThings(maxExecutionMem, maxStorageMem) + // Only execution memory should increase + assert(mm.acquireExecutionMemory(100L) === 100L) + assert(mm.storageMemoryUsed === 0L) + assert(mm.executionMemoryUsed === 100L) + assert(mm.acquireExecutionMemory(1000L) === 100L) + assert(mm.storageMemoryUsed === 0L) + assert(mm.executionMemoryUsed === 200L) + // Only storage memory should increase + assert(mm.acquireStorageMemory(dummyBlock, 50L, dummyBlocks)) + assertEnsureFreeSpaceCalled(ms, dummyBlock, 50L) + assert(mm.storageMemoryUsed === 50L) + assert(mm.executionMemoryUsed === 200L) + // Only execution memory should be released + mm.releaseExecutionMemory(133L) + assert(mm.storageMemoryUsed === 50L) + assert(mm.executionMemoryUsed === 67L) + // Only storage memory should be released + mm.releaseStorageMemory() + assert(mm.storageMemoryUsed === 0L) + assert(mm.executionMemoryUsed === 67L) + } + + test("unroll memory") { + val maxStorageMem = 1000L + val dummyBlock = TestBlockId("lonely water") + val dummyBlocks = new ArrayBuffer[(BlockId, BlockStatus)] + val (mm, ms) = makeThings(Long.MaxValue, maxStorageMem) + assert(mm.acquireUnrollMemory(dummyBlock, 100L, dummyBlocks)) + assertEnsureFreeSpaceCalled(ms, dummyBlock, 100L) + assert(mm.storageMemoryUsed === 100L) + mm.releaseUnrollMemory(40L) + assert(mm.storageMemoryUsed === 60L) + when(ms.currentUnrollMemory).thenReturn(60L) + assert(mm.acquireUnrollMemory(dummyBlock, 500L, dummyBlocks)) + // `spark.storage.unrollFraction` is 0.4, so the max unroll space is 400 bytes. + // Since we already occupy 60 bytes, we will try to ensure only 400 - 60 = 340 bytes. 
+ assertEnsureFreeSpaceCalled(ms, dummyBlock, 340L) + assert(mm.storageMemoryUsed === 560L) + when(ms.currentUnrollMemory).thenReturn(560L) + assert(!mm.acquireUnrollMemory(dummyBlock, 800L, dummyBlocks)) + assert(mm.storageMemoryUsed === 560L) + // We already have 560 bytes > the max unroll space of 400 bytes, so no bytes are freed + assertEnsureFreeSpaceCalled(ms, dummyBlock, 0L) + // Release beyond what was acquired + mm.releaseUnrollMemory(maxStorageMem) + assert(mm.storageMemoryUsed === 0L) + } + + /** + * Make a [[StaticMemoryManager]] and a [[MemoryStore]] with limited class dependencies. + */ + private def makeThings( + maxExecutionMem: Long, + maxStorageMem: Long): (StaticMemoryManager, MemoryStore) = { + val mm = new StaticMemoryManager( + conf, maxExecutionMemory = maxExecutionMem, maxStorageMemory = maxStorageMem) + val ms = mock(classOf[MemoryStore]) + mm.setMemoryStore(ms) + (mm, ms) + } + + /** + * Assert that [[MemoryStore.ensureFreeSpace]] is called with the given parameters. + */ + private def assertEnsureFreeSpaceCalled( + ms: MemoryStore, + blockId: BlockId, + numBytes: Long): Unit = { + verify(ms).ensureFreeSpace(meq(blockId), meq(numBytes: java.lang.Long), any()) + reset(ms) + } + +} diff --git a/core/src/test/scala/org/apache/spark/storage/BlockManagerReplicationSuite.scala b/core/src/test/scala/org/apache/spark/storage/BlockManagerReplicationSuite.scala index eb5af70d57aec..cc44c676b27ac 100644 --- a/core/src/test/scala/org/apache/spark/storage/BlockManagerReplicationSuite.scala +++ b/core/src/test/scala/org/apache/spark/storage/BlockManagerReplicationSuite.scala @@ -29,6 +29,7 @@ import org.scalatest.concurrent.Eventually._ import org.apache.spark.network.netty.NettyBlockTransferService import org.apache.spark.rpc.RpcEnv import org.apache.spark._ +import org.apache.spark.memory.StaticMemoryManager import org.apache.spark.network.BlockTransferService import org.apache.spark.scheduler.LiveListenerBus import org.apache.spark.serializer.KryoSerializer @@ -39,29 +40,31 @@ import org.apache.spark.storage.StorageLevel._ class BlockManagerReplicationSuite extends SparkFunSuite with Matchers with BeforeAndAfter { private val conf = new SparkConf(false).set("spark.app.id", "test") - var rpcEnv: RpcEnv = null - var master: BlockManagerMaster = null - val securityMgr = new SecurityManager(conf) - val mapOutputTracker = new MapOutputTrackerMaster(conf) - val shuffleManager = new HashShuffleManager(conf) + private var rpcEnv: RpcEnv = null + private var master: BlockManagerMaster = null + private val securityMgr = new SecurityManager(conf) + private val mapOutputTracker = new MapOutputTrackerMaster(conf) + private val shuffleManager = new HashShuffleManager(conf) // List of block manager created during an unit test, so that all of the them can be stopped // after the unit test. - val allStores = new ArrayBuffer[BlockManager] + private val allStores = new ArrayBuffer[BlockManager] // Reuse a serializer across tests to avoid creating a new thread-local buffer on each test conf.set("spark.kryoserializer.buffer", "1m") - val serializer = new KryoSerializer(conf) + private val serializer = new KryoSerializer(conf) // Implicitly convert strings to BlockIds for test clarity. 
- implicit def StringToBlockId(value: String): BlockId = new TestBlockId(value) + private implicit def StringToBlockId(value: String): BlockId = new TestBlockId(value) private def makeBlockManager( maxMem: Long, name: String = SparkContext.DRIVER_IDENTIFIER): BlockManager = { val transfer = new NettyBlockTransferService(conf, securityMgr, numCores = 1) - val store = new BlockManager(name, rpcEnv, master, serializer, maxMem, conf, - mapOutputTracker, shuffleManager, transfer, securityMgr, 0) + val memManager = new StaticMemoryManager(conf, Long.MaxValue, maxMem) + val store = new BlockManager(name, rpcEnv, master, serializer, conf, + memManager, mapOutputTracker, shuffleManager, transfer, securityMgr, 0) + memManager.setMemoryStore(store.memoryStore) store.initialize("app-id") allStores += store store @@ -258,8 +261,10 @@ class BlockManagerReplicationSuite extends SparkFunSuite with Matchers with Befo val failableTransfer = mock(classOf[BlockTransferService]) // this wont actually work when(failableTransfer.hostName).thenReturn("some-hostname") when(failableTransfer.port).thenReturn(1000) - val failableStore = new BlockManager("failable-store", rpcEnv, master, serializer, - 10000, conf, mapOutputTracker, shuffleManager, failableTransfer, securityMgr, 0) + val memManager = new StaticMemoryManager(conf, Long.MaxValue, 10000) + val failableStore = new BlockManager("failable-store", rpcEnv, master, serializer, conf, + memManager, mapOutputTracker, shuffleManager, failableTransfer, securityMgr, 0) + memManager.setMemoryStore(failableStore.memoryStore) failableStore.initialize("app-id") allStores += failableStore // so that this gets stopped after test assert(master.getPeers(store.blockManagerId).toSet === Set(failableStore.blockManagerId)) diff --git a/core/src/test/scala/org/apache/spark/storage/BlockManagerSuite.scala b/core/src/test/scala/org/apache/spark/storage/BlockManagerSuite.scala index 34bb4952e7246..f3fab33ca2e31 100644 --- a/core/src/test/scala/org/apache/spark/storage/BlockManagerSuite.scala +++ b/core/src/test/scala/org/apache/spark/storage/BlockManagerSuite.scala @@ -34,6 +34,7 @@ import org.apache.spark.network.netty.NettyBlockTransferService import org.apache.spark.rpc.RpcEnv import org.apache.spark._ import org.apache.spark.executor.DataReadMethod +import org.apache.spark.memory.StaticMemoryManager import org.apache.spark.scheduler.LiveListenerBus import org.apache.spark.serializer.{JavaSerializer, KryoSerializer} import org.apache.spark.shuffle.hash.HashShuffleManager @@ -67,10 +68,12 @@ class BlockManagerSuite extends SparkFunSuite with Matchers with BeforeAndAfterE maxMem: Long, name: String = SparkContext.DRIVER_IDENTIFIER): BlockManager = { val transfer = new NettyBlockTransferService(conf, securityMgr, numCores = 1) - val manager = new BlockManager(name, rpcEnv, master, serializer, maxMem, conf, - mapOutputTracker, shuffleManager, transfer, securityMgr, 0) - manager.initialize("app-id") - manager + val memManager = new StaticMemoryManager(conf, Long.MaxValue, maxMem) + val blockManager = new BlockManager(name, rpcEnv, master, serializer, conf, + memManager, mapOutputTracker, shuffleManager, transfer, securityMgr, 0) + memManager.setMemoryStore(blockManager.memoryStore) + blockManager.initialize("app-id") + blockManager } override def beforeEach(): Unit = { @@ -820,9 +823,11 @@ class BlockManagerSuite extends SparkFunSuite with Matchers with BeforeAndAfterE test("block store put failure") { // Use Java serializer so we can create an unserializable error. 
val transfer = new NettyBlockTransferService(conf, securityMgr, numCores = 1) + val memoryManager = new StaticMemoryManager(conf, Long.MaxValue, 1200) store = new BlockManager(SparkContext.DRIVER_IDENTIFIER, rpcEnv, master, - new JavaSerializer(conf), 1200, conf, mapOutputTracker, shuffleManager, transfer, securityMgr, - 0) + new JavaSerializer(conf), conf, memoryManager, mapOutputTracker, + shuffleManager, transfer, securityMgr, 0) + memoryManager.setMemoryStore(store.memoryStore) // The put should fail since a1 is not serializable. class UnserializableClass @@ -1043,14 +1048,19 @@ class BlockManagerSuite extends SparkFunSuite with Matchers with BeforeAndAfterE assert(memoryStore.currentUnrollMemory === 0) assert(memoryStore.currentUnrollMemoryForThisTask === 0) + def reserveUnrollMemoryForThisTask(memory: Long): Boolean = { + memoryStore.reserveUnrollMemoryForThisTask( + TestBlockId(""), memory, new ArrayBuffer[(BlockId, BlockStatus)]) + } + // Reserve - memoryStore.reserveUnrollMemoryForThisTask(100) + assert(reserveUnrollMemoryForThisTask(100)) assert(memoryStore.currentUnrollMemoryForThisTask === 100) - memoryStore.reserveUnrollMemoryForThisTask(200) + assert(reserveUnrollMemoryForThisTask(200)) assert(memoryStore.currentUnrollMemoryForThisTask === 300) - memoryStore.reserveUnrollMemoryForThisTask(500) + assert(reserveUnrollMemoryForThisTask(500)) assert(memoryStore.currentUnrollMemoryForThisTask === 800) - memoryStore.reserveUnrollMemoryForThisTask(1000000) + assert(!reserveUnrollMemoryForThisTask(1000000)) assert(memoryStore.currentUnrollMemoryForThisTask === 800) // not granted // Release memoryStore.releaseUnrollMemoryForThisTask(100) @@ -1058,9 +1068,9 @@ class BlockManagerSuite extends SparkFunSuite with Matchers with BeforeAndAfterE memoryStore.releaseUnrollMemoryForThisTask(100) assert(memoryStore.currentUnrollMemoryForThisTask === 600) // Reserve again - memoryStore.reserveUnrollMemoryForThisTask(4400) + assert(reserveUnrollMemoryForThisTask(4400)) assert(memoryStore.currentUnrollMemoryForThisTask === 5000) - memoryStore.reserveUnrollMemoryForThisTask(20000) + assert(!reserveUnrollMemoryForThisTask(20000)) assert(memoryStore.currentUnrollMemoryForThisTask === 5000) // not granted // Release again memoryStore.releaseUnrollMemoryForThisTask(1000) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/TestShuffleMemoryManager.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/TestShuffleMemoryManager.scala index 48c3938ff87ba..ff65d7bdf8b92 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/TestShuffleMemoryManager.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/TestShuffleMemoryManager.scala @@ -17,12 +17,18 @@ package org.apache.spark.sql.execution +import scala.collection.mutable + +import org.apache.spark.memory.MemoryManager import org.apache.spark.shuffle.ShuffleMemoryManager +import org.apache.spark.storage.{BlockId, BlockStatus} + /** * A [[ShuffleMemoryManager]] that can be controlled to run out of memory. 
*/ -class TestShuffleMemoryManager extends ShuffleMemoryManager(Long.MaxValue, 4 * 1024 * 1024) { +class TestShuffleMemoryManager + extends ShuffleMemoryManager(new GrantEverythingMemoryManager, 4 * 1024 * 1024) { private var oom = false override def tryToAcquire(numBytes: Long): Long = { @@ -49,3 +55,23 @@ class TestShuffleMemoryManager extends ShuffleMemoryManager(Long.MaxValue, 4 * 1 oom = true } } + +private class GrantEverythingMemoryManager extends MemoryManager { + override def acquireExecutionMemory(numBytes: Long): Long = numBytes + override def acquireStorageMemory( + blockId: BlockId, + numBytes: Long, + evictedBlocks: mutable.Buffer[(BlockId, BlockStatus)]): Boolean = true + override def acquireUnrollMemory( + blockId: BlockId, + numBytes: Long, + evictedBlocks: mutable.Buffer[(BlockId, BlockStatus)]): Boolean = true + override def releaseExecutionMemory(numBytes: Long): Unit = { } + override def releaseStorageMemory(numBytes: Long): Unit = { } + override def releaseStorageMemory(): Unit = { } + override def releaseUnrollMemory(numBytes: Long): Unit = { } + override def maxExecutionMemory: Long = Long.MaxValue + override def maxStorageMemory: Long = Long.MaxValue + override def executionMemoryUsed: Long = Long.MaxValue + override def storageMemoryUsed: Long = Long.MaxValue +} diff --git a/streaming/src/test/scala/org/apache/spark/streaming/ReceivedBlockHandlerSuite.scala b/streaming/src/test/scala/org/apache/spark/streaming/ReceivedBlockHandlerSuite.scala index 13cfe29d7b304..b2b6848719639 100644 --- a/streaming/src/test/scala/org/apache/spark/streaming/ReceivedBlockHandlerSuite.scala +++ b/streaming/src/test/scala/org/apache/spark/streaming/ReceivedBlockHandlerSuite.scala @@ -29,6 +29,7 @@ import org.scalatest.{BeforeAndAfter, Matchers} import org.scalatest.concurrent.Eventually._ import org.apache.spark._ +import org.apache.spark.memory.StaticMemoryManager import org.apache.spark.network.netty.NettyBlockTransferService import org.apache.spark.rpc.RpcEnv import org.apache.spark.scheduler.LiveListenerBus @@ -253,12 +254,14 @@ class ReceivedBlockHandlerSuite maxMem: Long, conf: SparkConf, name: String = SparkContext.DRIVER_IDENTIFIER): BlockManager = { + val memManager = new StaticMemoryManager(conf, Long.MaxValue, maxMem) val transfer = new NettyBlockTransferService(conf, securityMgr, numCores = 1) - val manager = new BlockManager(name, rpcEnv, blockManagerMaster, serializer, maxMem, conf, - mapOutputTracker, shuffleManager, transfer, securityMgr, 0) - manager.initialize("app-id") - blockManagerBuffer += manager - manager + val blockManager = new BlockManager(name, rpcEnv, blockManagerMaster, serializer, conf, + memManager, mapOutputTracker, shuffleManager, transfer, securityMgr, 0) + memManager.setMemoryStore(blockManager.memoryStore) + blockManager.initialize("app-id") + blockManagerBuffer += blockManager + blockManager } /** From 5410747a84e9be1cea44159dfc2216d5e0728ab4 Mon Sep 17 00:00:00 2001 From: Bryan Cutler Date: Thu, 8 Oct 2015 22:21:07 -0700 Subject: [PATCH 012/139] [SPARK-10959] [PYSPARK] StreamingLogisticRegressionWithSGD does not train with given regParam and convergenceTol parameters These params were being passed into the StreamingLogisticRegressionWithSGD constructor, but not transferred to the call for model training. Same with StreamingLinearRegressionWithSGD. I added the params as named arguments to the call and also fixed the intercept parameter, which was being passed as regularization value. 
Author: Bryan Cutler Closes #9002 from BryanCutler/StreamingSGD-convergenceTol-bug-10959. --- python/pyspark/mllib/classification.py | 3 ++- python/pyspark/mllib/regression.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/python/pyspark/mllib/classification.py b/python/pyspark/mllib/classification.py index cb4ee83678081..b77754500bded 100644 --- a/python/pyspark/mllib/classification.py +++ b/python/pyspark/mllib/classification.py @@ -639,7 +639,8 @@ def update(rdd): if not rdd.isEmpty(): self._model = LogisticRegressionWithSGD.train( rdd, self.numIterations, self.stepSize, - self.miniBatchFraction, self._model.weights) + self.miniBatchFraction, self._model.weights, + regParam=self.regParam, convergenceTol=self.convergenceTol) dstream.foreachRDD(update) diff --git a/python/pyspark/mllib/regression.py b/python/pyspark/mllib/regression.py index 256b7537fef6b..961b5e80b013c 100644 --- a/python/pyspark/mllib/regression.py +++ b/python/pyspark/mllib/regression.py @@ -679,7 +679,7 @@ def update(rdd): self._model = LinearRegressionWithSGD.train( rdd, self.numIterations, self.stepSize, self.miniBatchFraction, self._model.weights, - self._model.intercept) + intercept=self._model.intercept, convergenceTol=self.convergenceTol) dstream.foreachRDD(update) From 5994cfe81271a39294aa29fd47aa94c99aa56743 Mon Sep 17 00:00:00 2001 From: Nick Pritchard Date: Thu, 8 Oct 2015 22:22:20 -0700 Subject: [PATCH 013/139] [SPARK-10875] [MLLIB] Computed covariance matrix should be symmetric Compute upper triangular values of the covariance matrix, then copy to lower triangular values. Author: Nick Pritchard Closes #8940 from pnpritchard/SPARK-10875. --- .../mllib/linalg/distributed/RowMatrix.scala | 6 ++++-- .../linalg/distributed/RowMatrixSuite.scala | 18 ++++++++++++++++++ 2 files changed, 22 insertions(+), 2 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/RowMatrix.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/RowMatrix.scala index 7c7d900af3d5a..b8a7adceb15b6 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/RowMatrix.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/RowMatrix.scala @@ -357,9 +357,11 @@ class RowMatrix @Since("1.0.0") ( var alpha = 0.0 while (i < n) { alpha = m / m1 * mean(i) - j = 0 + j = i while (j < n) { - G(i, j) = G(i, j) / m1 - alpha * mean(j) + val Gij = G(i, j) / m1 - alpha * mean(j) + G(i, j) = Gij + G(j, i) = Gij j += 1 } i += 1 diff --git a/mllib/src/test/scala/org/apache/spark/mllib/linalg/distributed/RowMatrixSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/linalg/distributed/RowMatrixSuite.scala index 283ffec1d49d7..4abb98fb6fe4e 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/linalg/distributed/RowMatrixSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/linalg/distributed/RowMatrixSuite.scala @@ -24,6 +24,7 @@ import breeze.linalg.{DenseVector => BDV, DenseMatrix => BDM, norm => brzNorm, s import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.linalg.{Matrices, Vectors, Vector} +import org.apache.spark.mllib.random.RandomRDDs import org.apache.spark.mllib.util.{LocalClusterSparkContext, MLlibTestSparkContext} class RowMatrixSuite extends SparkFunSuite with MLlibTestSparkContext { @@ -255,6 +256,23 @@ class RowMatrixSuite extends SparkFunSuite with MLlibTestSparkContext { assert(closeToZero(abs(expected.r) - abs(rOnly.R.toBreeze.asInstanceOf[BDM[Double]]))) } } + + test("compute covariance") { + for (mat <- 
Seq(denseMat, sparseMat)) { + val result = mat.computeCovariance() + val expected = breeze.linalg.cov(mat.toBreeze()) + assert(closeToZero(abs(expected) - abs(result.toBreeze.asInstanceOf[BDM[Double]]))) + } + } + + test("covariance matrix is symmetric (SPARK-10875)") { + val rdd = RandomRDDs.normalVectorRDD(sc, 100, 10, 0, 0) + val matrix = new RowMatrix(rdd) + val cov = matrix.computeCovariance() + for (i <- 0 until cov.numRows; j <- 0 until i) { + assert(cov(i, j) === cov(j, i)) + } + } } class RowMatrixClusterSuite extends SparkFunSuite with LocalClusterSparkContext { From 70f44ad2d836236c74e1336a7368982d5fe3abff Mon Sep 17 00:00:00 2001 From: Rerngvit Yanggratoke Date: Fri, 9 Oct 2015 09:36:40 -0700 Subject: [PATCH 014/139] [SPARK-10905] [SPARKR] Export freqItems() for DataFrameStatFunctions [SPARK-10905][SparkR]: Export freqItems() for DataFrameStatFunctions - Add function (together with roxygen2 doc) to DataFrame.R and generics.R - Expose the function in NAMESPACE - Add unit test for the function Author: Rerngvit Yanggratoke Closes #8962 from rerngvit/SPARK-10905. --- R/pkg/NAMESPACE | 1 + R/pkg/R/generics.R | 4 ++++ R/pkg/R/stats.R | 27 +++++++++++++++++++++++++++ R/pkg/inst/tests/test_sparkSQL.R | 21 +++++++++++++++++++++ 4 files changed, 53 insertions(+) diff --git a/R/pkg/NAMESPACE b/R/pkg/NAMESPACE index 9aad35469bbb7..255be2e76ff49 100644 --- a/R/pkg/NAMESPACE +++ b/R/pkg/NAMESPACE @@ -40,6 +40,7 @@ exportMethods("arrange", "fillna", "filter", "first", + "freqItems", "group_by", "groupBy", "head", diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R index e9086fdbd18c6..c4474131804bb 100644 --- a/R/pkg/R/generics.R +++ b/R/pkg/R/generics.R @@ -63,6 +63,10 @@ setGeneric("countByValue", function(x) { standardGeneric("countByValue") }) # @export setGeneric("crosstab", function(x, col1, col2) { standardGeneric("crosstab") }) +# @rdname statfunctions +# @export +setGeneric("freqItems", function(x, cols, support = 0.01) { standardGeneric("freqItems") }) + # @rdname distinct # @export setGeneric("distinct", function(x, numPartitions = 1) { standardGeneric("distinct") }) diff --git a/R/pkg/R/stats.R b/R/pkg/R/stats.R index 06382d55d086e..4928cf4d4367d 100644 --- a/R/pkg/R/stats.R +++ b/R/pkg/R/stats.R @@ -100,3 +100,30 @@ setMethod("corr", statFunctions <- callJMethod(x@sdf, "stat") callJMethod(statFunctions, "corr", col1, col2, method) }) + +#' freqItems +#' +#' Finding frequent items for columns, possibly with false positives. +#' Using the frequent element count algorithm described in +#' \url{http://dx.doi.org/10.1145/762471.762473}, proposed by Karp, Schenker, and Papadimitriou. +#' +#' @param x A SparkSQL DataFrame. +#' @param cols A vector column names to search frequent items in. +#' @param support (Optional) The minimum frequency for an item to be considered `frequent`. +#' Should be greater than 1e-4. Default support = 0.01. 
+#' @return a local R data.frame with the frequent items in each column +#' +#' @rdname statfunctions +#' @name freqItems +#' @export +#' @examples +#' \dontrun{ +#' df <- jsonFile(sqlContext, "/path/to/file.json") +#' fi = freqItems(df, c("title", "gender")) +#' } +setMethod("freqItems", signature(x = "DataFrame", cols = "character"), + function(x, cols, support = 0.01) { + statFunctions <- callJMethod(x@sdf, "stat") + sct <- callJMethod(statFunctions, "freqItems", as.list(cols), support) + collect(dataFrame(sct)) + }) diff --git a/R/pkg/inst/tests/test_sparkSQL.R b/R/pkg/inst/tests/test_sparkSQL.R index e85de2507085c..4804ecf177341 100644 --- a/R/pkg/inst/tests/test_sparkSQL.R +++ b/R/pkg/inst/tests/test_sparkSQL.R @@ -1350,6 +1350,27 @@ test_that("cov() and corr() on a DataFrame", { expect_true(abs(result - 1.0) < 1e-12) }) +test_that("freqItems() on a DataFrame", { + input <- 1:1000 + rdf <- data.frame(numbers = input, letters = as.character(input), + negDoubles = input * -1.0, stringsAsFactors = F) + rdf[ input %% 3 == 0, ] <- c(1, "1", -1) + df <- createDataFrame(sqlContext, rdf) + multiColResults <- freqItems(df, c("numbers", "letters"), support=0.1) + expect_true(1 %in% multiColResults$numbers[[1]]) + expect_true("1" %in% multiColResults$letters[[1]]) + singleColResult <- freqItems(df, "negDoubles", support=0.1) + expect_true(-1 %in% head(singleColResult$negDoubles)[[1]]) + + l <- lapply(c(0:99), function(i) { + if (i %% 2 == 0) { list(1L, -1.0) } + else { list(i, i * -1.0) }}) + df <- createDataFrame(sqlContext, l, c("a", "b")) + result <- freqItems(df, c("a", "b"), 0.4) + expect_identical(result[[1]], list(list(1L, 99L))) + expect_identical(result[[2]], list(list(-1, -99))) +}) + test_that("SQL error message is returned from JVM", { retError <- tryCatch(sql(sqlContext, "select * from blah"), error = function(e) e) expect_equal(grepl("Table Not Found: blah", retError), TRUE) From 015f7ef503d5544f79512b6333326749a1f0c48b Mon Sep 17 00:00:00 2001 From: Marcelo Vanzin Date: Fri, 9 Oct 2015 15:28:09 -0500 Subject: [PATCH 015/139] [SPARK-8673] [LAUNCHER] API and infrastructure for communicating with child apps. This change adds an API that encapsulates information about an app launched using the library. It also creates a socket-based communication layer for apps that are launched as child processes; the launching application listens for connections from launched apps, and once communication is established, the channel can be used to send updates to the launching app, or to send commands to the child app. The change also includes hooks for local, standalone/client and yarn masters. Author: Marcelo Vanzin Closes #7052 from vanzin/SPARK-8673. 
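For illustration only, a minimal sketch (not part of this patch) of how the new handle-based launcher API might be used from application code, assuming the startApplication(...) entry point and the SparkAppHandle.Listener callbacks introduced below; the jar path, main class, and master shown are placeholders:

    import java.util.concurrent.CountDownLatch

    import org.apache.spark.launcher.{SparkAppHandle, SparkLauncher}

    object LauncherHandleExample {
      def main(args: Array[String]): Unit = {
        val done = new CountDownLatch(1)

        // Launch the app as a child process and register for state updates instead of
        // polling the raw Process object. Resource, class and master are placeholders.
        val handle = new SparkLauncher()
          .setAppResource("/path/to/app.jar")
          .setMainClass("com.example.MyApp")
          .setMaster("local[2]")
          .startApplication(new SparkAppHandle.Listener {
            override def stateChanged(h: SparkAppHandle): Unit = {
              println(s"state = ${h.getState}, appId = ${h.getAppId}")
              if (h.getState.isFinal) {
                done.countDown()
              }
            }
            override def infoChanged(h: SparkAppHandle): Unit = { }
          })

        // Block until the handle reports a terminal state.
        done.await()
        println(s"final state: ${handle.getState}")
      }
    }

The same handle also exposes stop() and kill() for controlling the child app; stop() is delivered over the socket channel described above, which the backend handles via onStopRequest().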
--- .../spark/launcher/LauncherBackend.scala | 119 ++++++ .../cluster/SparkDeploySchedulerBackend.scala | 35 +- .../spark/scheduler/local/LocalBackend.scala | 19 +- .../spark/launcher/SparkLauncherSuite.java | 39 +- core/src/test/resources/log4j.properties | 11 +- .../spark/launcher/LauncherBackendSuite.scala | 81 +++++ launcher/pom.xml | 5 + .../launcher/AbstractCommandBuilder.java | 38 +- .../spark/launcher/ChildProcAppHandle.java | 159 ++++++++ .../spark/launcher/LauncherConnection.java | 110 ++++++ .../spark/launcher/LauncherProtocol.java | 93 +++++ .../apache/spark/launcher/LauncherServer.java | 341 ++++++++++++++++++ .../spark/launcher/NamedThreadFactory.java | 40 ++ .../spark/launcher/OutputRedirector.java | 78 ++++ .../apache/spark/launcher/SparkAppHandle.java | 126 +++++++ .../apache/spark/launcher/SparkLauncher.java | 106 +++++- .../launcher/SparkSubmitCommandBuilder.java | 22 +- .../apache/spark/launcher/package-info.java | 38 +- .../org/apache/spark/launcher/BaseSuite.java | 32 ++ .../spark/launcher/LauncherServerSuite.java | 188 ++++++++++ .../SparkSubmitCommandBuilderSuite.java | 4 +- .../SparkSubmitOptionParserSuite.java | 2 +- launcher/src/test/resources/log4j.properties | 13 +- .../org/apache/spark/deploy/yarn/Client.scala | 43 ++- .../cluster/YarnClientSchedulerBackend.scala | 10 + yarn/src/test/resources/log4j.properties | 7 +- .../deploy/yarn/BaseYarnClusterSuite.scala | 127 ++++--- .../spark/deploy/yarn/YarnClusterSuite.scala | 76 +++- .../yarn/YarnShuffleIntegrationSuite.scala | 4 +- 29 files changed, 1820 insertions(+), 146 deletions(-) create mode 100644 core/src/main/scala/org/apache/spark/launcher/LauncherBackend.scala create mode 100644 core/src/test/scala/org/apache/spark/launcher/LauncherBackendSuite.scala create mode 100644 launcher/src/main/java/org/apache/spark/launcher/ChildProcAppHandle.java create mode 100644 launcher/src/main/java/org/apache/spark/launcher/LauncherConnection.java create mode 100644 launcher/src/main/java/org/apache/spark/launcher/LauncherProtocol.java create mode 100644 launcher/src/main/java/org/apache/spark/launcher/LauncherServer.java create mode 100644 launcher/src/main/java/org/apache/spark/launcher/NamedThreadFactory.java create mode 100644 launcher/src/main/java/org/apache/spark/launcher/OutputRedirector.java create mode 100644 launcher/src/main/java/org/apache/spark/launcher/SparkAppHandle.java create mode 100644 launcher/src/test/java/org/apache/spark/launcher/BaseSuite.java create mode 100644 launcher/src/test/java/org/apache/spark/launcher/LauncherServerSuite.java diff --git a/core/src/main/scala/org/apache/spark/launcher/LauncherBackend.scala b/core/src/main/scala/org/apache/spark/launcher/LauncherBackend.scala new file mode 100644 index 0000000000000..3ea984c501e02 --- /dev/null +++ b/core/src/main/scala/org/apache/spark/launcher/LauncherBackend.scala @@ -0,0 +1,119 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.launcher + +import java.net.{InetAddress, Socket} + +import org.apache.spark.SPARK_VERSION +import org.apache.spark.launcher.LauncherProtocol._ +import org.apache.spark.util.ThreadUtils + +/** + * A class that can be used to talk to a launcher server. Users should extend this class to + * provide implementation for the abstract methods. + * + * See `LauncherServer` for an explanation of how launcher communication works. + */ +private[spark] abstract class LauncherBackend { + + private var clientThread: Thread = _ + private var connection: BackendConnection = _ + private var lastState: SparkAppHandle.State = _ + @volatile private var _isConnected = false + + def connect(): Unit = { + val port = sys.env.get(LauncherProtocol.ENV_LAUNCHER_PORT).map(_.toInt) + val secret = sys.env.get(LauncherProtocol.ENV_LAUNCHER_SECRET) + if (port != None && secret != None) { + val s = new Socket(InetAddress.getLoopbackAddress(), port.get) + connection = new BackendConnection(s) + connection.send(new Hello(secret.get, SPARK_VERSION)) + clientThread = LauncherBackend.threadFactory.newThread(connection) + clientThread.start() + _isConnected = true + } + } + + def close(): Unit = { + if (connection != null) { + try { + connection.close() + } finally { + if (clientThread != null) { + clientThread.join() + } + } + } + } + + def setAppId(appId: String): Unit = { + if (connection != null) { + connection.send(new SetAppId(appId)) + } + } + + def setState(state: SparkAppHandle.State): Unit = { + if (connection != null && lastState != state) { + connection.send(new SetState(state)) + lastState = state + } + } + + /** Return whether the launcher handle is still connected to this backend. */ + def isConnected(): Boolean = _isConnected + + /** + * Implementations should provide this method, which should try to stop the application + * as gracefully as possible. + */ + protected def onStopRequest(): Unit + + /** + * Callback for when the launcher handle disconnects from this backend. 
+ */ + protected def onDisconnected() : Unit = { } + + + private class BackendConnection(s: Socket) extends LauncherConnection(s) { + + override protected def handle(m: Message): Unit = m match { + case _: Stop => + onStopRequest() + + case _ => + throw new IllegalArgumentException(s"Unexpected message type: ${m.getClass().getName()}") + } + + override def close(): Unit = { + try { + super.close() + } finally { + onDisconnected() + _isConnected = false + } + } + + } + +} + +private object LauncherBackend { + + val threadFactory = ThreadUtils.namedThreadFactory("LauncherBackend") + +} diff --git a/core/src/main/scala/org/apache/spark/scheduler/cluster/SparkDeploySchedulerBackend.scala b/core/src/main/scala/org/apache/spark/scheduler/cluster/SparkDeploySchedulerBackend.scala index 27491ecf8b97d..2625c3e7ac718 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/cluster/SparkDeploySchedulerBackend.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/cluster/SparkDeploySchedulerBackend.scala @@ -23,6 +23,7 @@ import org.apache.spark.rpc.RpcAddress import org.apache.spark.{Logging, SparkConf, SparkContext, SparkEnv} import org.apache.spark.deploy.{ApplicationDescription, Command} import org.apache.spark.deploy.client.{AppClient, AppClientListener} +import org.apache.spark.launcher.{LauncherBackend, SparkAppHandle} import org.apache.spark.scheduler._ import org.apache.spark.util.Utils @@ -36,6 +37,9 @@ private[spark] class SparkDeploySchedulerBackend( private var client: AppClient = null private var stopping = false + private val launcherBackend = new LauncherBackend() { + override protected def onStopRequest(): Unit = stop(SparkAppHandle.State.KILLED) + } @volatile var shutdownCallback: SparkDeploySchedulerBackend => Unit = _ @volatile private var appId: String = _ @@ -47,6 +51,7 @@ private[spark] class SparkDeploySchedulerBackend( override def start() { super.start() + launcherBackend.connect() // The endpoint for executors to talk to us val driverUrl = rpcEnv.uriOf(SparkEnv.driverActorSystemName, @@ -87,24 +92,20 @@ private[spark] class SparkDeploySchedulerBackend( command, appUIAddress, sc.eventLogDir, sc.eventLogCodec, coresPerExecutor) client = new AppClient(sc.env.rpcEnv, masters, appDesc, this, conf) client.start() + launcherBackend.setState(SparkAppHandle.State.SUBMITTED) waitForRegistration() + launcherBackend.setState(SparkAppHandle.State.RUNNING) } - override def stop() { - stopping = true - super.stop() - client.stop() - - val callback = shutdownCallback - if (callback != null) { - callback(this) - } + override def stop(): Unit = synchronized { + stop(SparkAppHandle.State.FINISHED) } override def connected(appId: String) { logInfo("Connected to Spark cluster with app ID " + appId) this.appId = appId notifyContext() + launcherBackend.setAppId(appId) } override def disconnected() { @@ -117,6 +118,7 @@ private[spark] class SparkDeploySchedulerBackend( override def dead(reason: String) { notifyContext() if (!stopping) { + launcherBackend.setState(SparkAppHandle.State.KILLED) logError("Application has been killed. 
Reason: " + reason) try { scheduler.error(reason) @@ -188,4 +190,19 @@ private[spark] class SparkDeploySchedulerBackend( registrationBarrier.release() } + private def stop(finalState: SparkAppHandle.State): Unit = synchronized { + stopping = true + + launcherBackend.setState(finalState) + launcherBackend.close() + + super.stop() + client.stop() + + val callback = shutdownCallback + if (callback != null) { + callback(this) + } + } + } diff --git a/core/src/main/scala/org/apache/spark/scheduler/local/LocalBackend.scala b/core/src/main/scala/org/apache/spark/scheduler/local/LocalBackend.scala index 4d48fcfea44e7..c633d860ae6e5 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/local/LocalBackend.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/local/LocalBackend.scala @@ -24,6 +24,7 @@ import java.nio.ByteBuffer import org.apache.spark.{Logging, SparkConf, SparkContext, SparkEnv, TaskState} import org.apache.spark.TaskState.TaskState import org.apache.spark.executor.{Executor, ExecutorBackend} +import org.apache.spark.launcher.{LauncherBackend, SparkAppHandle} import org.apache.spark.rpc.{RpcCallContext, RpcEndpointRef, RpcEnv, ThreadSafeRpcEndpoint} import org.apache.spark.scheduler._ import org.apache.spark.scheduler.cluster.ExecutorInfo @@ -103,6 +104,9 @@ private[spark] class LocalBackend( private var localEndpoint: RpcEndpointRef = null private val userClassPath = getUserClasspath(conf) private val listenerBus = scheduler.sc.listenerBus + private val launcherBackend = new LauncherBackend() { + override def onStopRequest(): Unit = stop(SparkAppHandle.State.KILLED) + } /** * Returns a list of URLs representing the user classpath. @@ -114,6 +118,8 @@ private[spark] class LocalBackend( userClassPathStr.map(_.split(File.pathSeparator)).toSeq.flatten.map(new File(_).toURI.toURL) } + launcherBackend.connect() + override def start() { val rpcEnv = SparkEnv.get.rpcEnv val executorEndpoint = new LocalEndpoint(rpcEnv, userClassPath, scheduler, this, totalCores) @@ -122,10 +128,12 @@ private[spark] class LocalBackend( System.currentTimeMillis, executorEndpoint.localExecutorId, new ExecutorInfo(executorEndpoint.localExecutorHostname, totalCores, Map.empty))) + launcherBackend.setAppId(appId) + launcherBackend.setState(SparkAppHandle.State.RUNNING) } override def stop() { - localEndpoint.ask(StopExecutor) + stop(SparkAppHandle.State.FINISHED) } override def reviveOffers() { @@ -145,4 +153,13 @@ private[spark] class LocalBackend( override def applicationId(): String = appId + private def stop(finalState: SparkAppHandle.State): Unit = { + localEndpoint.ask(StopExecutor) + try { + launcherBackend.setState(finalState) + } finally { + launcherBackend.close() + } + } + } diff --git a/core/src/test/java/org/apache/spark/launcher/SparkLauncherSuite.java b/core/src/test/java/org/apache/spark/launcher/SparkLauncherSuite.java index d0c26dd05679b..aa15e792e2b27 100644 --- a/core/src/test/java/org/apache/spark/launcher/SparkLauncherSuite.java +++ b/core/src/test/java/org/apache/spark/launcher/SparkLauncherSuite.java @@ -27,6 +27,7 @@ import org.junit.Test; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import org.slf4j.bridge.SLF4JBridgeHandler; import static org.junit.Assert.*; /** @@ -34,7 +35,13 @@ */ public class SparkLauncherSuite { + static { + SLF4JBridgeHandler.removeHandlersForRootLogger(); + SLF4JBridgeHandler.install(); + } + private static final Logger LOG = LoggerFactory.getLogger(SparkLauncherSuite.class); + private static final NamedThreadFactory TF = new 
NamedThreadFactory("SparkLauncherSuite-%d"); @Test public void testSparkArgumentHandling() throws Exception { @@ -94,14 +101,15 @@ public void testChildProcLauncher() throws Exception { .addSparkArg(opts.CONF, String.format("%s=-Dfoo=ShouldBeOverriddenBelow", SparkLauncher.DRIVER_EXTRA_JAVA_OPTIONS)) .setConf(SparkLauncher.DRIVER_EXTRA_JAVA_OPTIONS, - "-Dfoo=bar -Dtest.name=-testChildProcLauncher") + "-Dfoo=bar -Dtest.appender=childproc") .setConf(SparkLauncher.DRIVER_EXTRA_CLASSPATH, System.getProperty("java.class.path")) .addSparkArg(opts.CLASS, "ShouldBeOverriddenBelow") .setMainClass(SparkLauncherTestApp.class.getName()) .addAppArgs("proc"); final Process app = launcher.launch(); - new Redirector("stdout", app.getInputStream()).start(); - new Redirector("stderr", app.getErrorStream()).start(); + + new OutputRedirector(app.getInputStream(), TF); + new OutputRedirector(app.getErrorStream(), TF); assertEquals(0, app.waitFor()); } @@ -116,29 +124,4 @@ public static void main(String[] args) throws Exception { } - private static class Redirector extends Thread { - - private final InputStream in; - - Redirector(String name, InputStream in) { - this.in = in; - setName(name); - setDaemon(true); - } - - @Override - public void run() { - try { - BufferedReader reader = new BufferedReader(new InputStreamReader(in, "UTF-8")); - String line; - while ((line = reader.readLine()) != null) { - LOG.warn(line); - } - } catch (Exception e) { - LOG.error("Error reading process output.", e); - } - } - - } - } diff --git a/core/src/test/resources/log4j.properties b/core/src/test/resources/log4j.properties index eb3b1999eb996..a54d27de91ed2 100644 --- a/core/src/test/resources/log4j.properties +++ b/core/src/test/resources/log4j.properties @@ -16,13 +16,22 @@ # # Set everything to be logged to the file target/unit-tests.log -log4j.rootCategory=INFO, file +test.appender=file +log4j.rootCategory=INFO, ${test.appender} log4j.appender.file=org.apache.log4j.FileAppender log4j.appender.file.append=true log4j.appender.file.file=target/unit-tests.log log4j.appender.file.layout=org.apache.log4j.PatternLayout log4j.appender.file.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss.SSS} %t %p %c{1}: %m%n +# Tests that launch java subprocesses can set the "test.appender" system property to +# "console" to avoid having the child process's logs overwrite the unit test's +# log file. +log4j.appender.console=org.apache.log4j.ConsoleAppender +log4j.appender.console.target=System.err +log4j.appender.console.layout=org.apache.log4j.PatternLayout +log4j.appender.console.layout.ConversionPattern=%t: %m%n + # Ignore messages below warning level from Jetty, because it's a bit verbose log4j.logger.org.spark-project.jetty=WARN org.spark-project.jetty.LEVEL=WARN diff --git a/core/src/test/scala/org/apache/spark/launcher/LauncherBackendSuite.scala b/core/src/test/scala/org/apache/spark/launcher/LauncherBackendSuite.scala new file mode 100644 index 0000000000000..07e8869833e95 --- /dev/null +++ b/core/src/test/scala/org/apache/spark/launcher/LauncherBackendSuite.scala @@ -0,0 +1,81 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.launcher + +import java.util.concurrent.TimeUnit + +import scala.concurrent.duration._ +import scala.language.postfixOps + +import org.scalatest.Matchers +import org.scalatest.concurrent.Eventually._ + +import org.apache.spark._ +import org.apache.spark.launcher._ + +class LauncherBackendSuite extends SparkFunSuite with Matchers { + + private val tests = Seq( + "local" -> "local", + "standalone/client" -> "local-cluster[1,1,1024]") + + tests.foreach { case (name, master) => + test(s"$name: launcher handle") { + testWithMaster(master) + } + } + + private def testWithMaster(master: String): Unit = { + val env = new java.util.HashMap[String, String]() + env.put("SPARK_PRINT_LAUNCH_COMMAND", "1") + val handle = new SparkLauncher(env) + .setSparkHome(sys.props("spark.test.home")) + .setConf(SparkLauncher.DRIVER_EXTRA_CLASSPATH, System.getProperty("java.class.path")) + .setConf("spark.ui.enabled", "false") + .setConf(SparkLauncher.DRIVER_EXTRA_JAVA_OPTIONS, s"-Dtest.appender=console") + .setMaster(master) + .setAppResource("spark-internal") + .setMainClass(TestApp.getClass.getName().stripSuffix("$")) + .startApplication() + + try { + eventually(timeout(10 seconds), interval(100 millis)) { + handle.getAppId() should not be (null) + } + + handle.stop() + + eventually(timeout(10 seconds), interval(100 millis)) { + handle.getState() should be (SparkAppHandle.State.KILLED) + } + } finally { + handle.kill() + } + } + +} + +object TestApp { + + def main(args: Array[String]): Unit = { + new SparkContext(new SparkConf()).parallelize(Seq(1)).foreach { i => + Thread.sleep(TimeUnit.SECONDS.toMillis(20)) + } + } + +} diff --git a/launcher/pom.xml b/launcher/pom.xml index d595d74642ab2..5739bfc16958f 100644 --- a/launcher/pom.xml +++ b/launcher/pom.xml @@ -47,6 +47,11 @@ mockito-core test + + org.slf4j + jul-to-slf4j + test + org.slf4j slf4j-api diff --git a/launcher/src/main/java/org/apache/spark/launcher/AbstractCommandBuilder.java b/launcher/src/main/java/org/apache/spark/launcher/AbstractCommandBuilder.java index 610e8bdaaa639..cf3729b7febc3 100644 --- a/launcher/src/main/java/org/apache/spark/launcher/AbstractCommandBuilder.java +++ b/launcher/src/main/java/org/apache/spark/launcher/AbstractCommandBuilder.java @@ -47,7 +47,7 @@ abstract class AbstractCommandBuilder { String javaHome; String mainClass; String master; - String propertiesFile; + protected String propertiesFile; final List appArgs; final List jars; final List files; @@ -55,6 +55,10 @@ abstract class AbstractCommandBuilder { final Map childEnv; final Map conf; + // The merged configuration for the application. Cached to avoid having to read / parse + // properties files multiple times. 
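From the caller's side, the practical effect of the merged configuration described in the comment above is that values set programmatically win over entries read from the properties file, because getEffectiveConfig() copies the builder's conf map first and only fills in keys that are still missing. A small sketch; the defaults-file path and memory values are just examples:

import org.apache.spark.launcher.SparkLauncher

val launcher = new SparkLauncher()
  // Hypothetical defaults file containing, e.g., spark.driver.memory=1g
  .setPropertiesFile("/path/to/spark-defaults.conf")
  // Programmatic value; takes precedence over the entry in the file above
  .setConf(SparkLauncher.DRIVER_MEMORY, "4g")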
+ private Map effectiveConfig; + public AbstractCommandBuilder() { this.appArgs = new ArrayList(); this.childEnv = new HashMap(); @@ -257,12 +261,38 @@ String getSparkHome() { return path; } + String getenv(String key) { + return firstNonEmpty(childEnv.get(key), System.getenv(key)); + } + + void setPropertiesFile(String path) { + effectiveConfig = null; + this.propertiesFile = path; + } + + Map getEffectiveConfig() throws IOException { + if (effectiveConfig == null) { + if (propertiesFile == null) { + effectiveConfig = conf; + } else { + effectiveConfig = new HashMap<>(conf); + Properties p = loadPropertiesFile(); + for (String key : p.stringPropertyNames()) { + if (!effectiveConfig.containsKey(key)) { + effectiveConfig.put(key, p.getProperty(key)); + } + } + } + } + return effectiveConfig; + } + /** * Loads the configuration file for the application, if it exists. This is either the * user-specified properties file, or the spark-defaults.conf file under the Spark configuration * directory. */ - Properties loadPropertiesFile() throws IOException { + private Properties loadPropertiesFile() throws IOException { Properties props = new Properties(); File propsFile; if (propertiesFile != null) { @@ -294,10 +324,6 @@ Properties loadPropertiesFile() throws IOException { return props; } - String getenv(String key) { - return firstNonEmpty(childEnv.get(key), System.getenv(key)); - } - private String findAssembly() { String sparkHome = getSparkHome(); File libdir; diff --git a/launcher/src/main/java/org/apache/spark/launcher/ChildProcAppHandle.java b/launcher/src/main/java/org/apache/spark/launcher/ChildProcAppHandle.java new file mode 100644 index 0000000000000..de50f14fbdc87 --- /dev/null +++ b/launcher/src/main/java/org/apache/spark/launcher/ChildProcAppHandle.java @@ -0,0 +1,159 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.launcher; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; +import java.util.concurrent.ThreadFactory; +import java.util.logging.Level; +import java.util.logging.Logger; + +/** + * Handle implementation for monitoring apps started as a child process. 
+ */ +class ChildProcAppHandle implements SparkAppHandle { + + private static final Logger LOG = Logger.getLogger(ChildProcAppHandle.class.getName()); + private static final ThreadFactory REDIRECTOR_FACTORY = + new NamedThreadFactory("launcher-proc-%d"); + + private final String secret; + private final LauncherServer server; + + private Process childProc; + private boolean disposed; + private LauncherConnection connection; + private List listeners; + private State state; + private String appId; + private OutputRedirector redirector; + + ChildProcAppHandle(String secret, LauncherServer server) { + this.secret = secret; + this.server = server; + this.state = State.UNKNOWN; + } + + @Override + public synchronized void addListener(Listener l) { + if (listeners == null) { + listeners = new ArrayList<>(); + } + listeners.add(l); + } + + @Override + public State getState() { + return state; + } + + @Override + public String getAppId() { + return appId; + } + + @Override + public void stop() { + CommandBuilderUtils.checkState(connection != null, "Application is still not connected."); + try { + connection.send(new LauncherProtocol.Stop()); + } catch (IOException ioe) { + throw new RuntimeException(ioe); + } + } + + @Override + public synchronized void disconnect() { + if (!disposed) { + disposed = true; + if (connection != null) { + try { + connection.close(); + } catch (IOException ioe) { + // no-op. + } + } + server.unregister(this); + if (redirector != null) { + redirector.stop(); + } + } + } + + @Override + public synchronized void kill() { + if (!disposed) { + disconnect(); + } + if (childProc != null) { + childProc.destroy(); + childProc = null; + } + } + + String getSecret() { + return secret; + } + + void setChildProc(Process childProc, String loggerName) { + this.childProc = childProc; + this.redirector = new OutputRedirector(childProc.getInputStream(), loggerName, + REDIRECTOR_FACTORY); + } + + void setConnection(LauncherConnection connection) { + this.connection = connection; + } + + LauncherServer getServer() { + return server; + } + + LauncherConnection getConnection() { + return connection; + } + + void setState(State s) { + if (!state.isFinal()) { + state = s; + fireEvent(false); + } else { + LOG.log(Level.WARNING, "Backend requested transition from final state {0} to {1}.", + new Object[] { state, s }); + } + } + + void setAppId(String appId) { + this.appId = appId; + fireEvent(true); + } + + private synchronized void fireEvent(boolean isInfoChanged) { + if (listeners != null) { + for (Listener l : listeners) { + if (isInfoChanged) { + l.infoChanged(this); + } else { + l.stateChanged(this); + } + } + } + } + +} diff --git a/launcher/src/main/java/org/apache/spark/launcher/LauncherConnection.java b/launcher/src/main/java/org/apache/spark/launcher/LauncherConnection.java new file mode 100644 index 0000000000000..eec264909bbb6 --- /dev/null +++ b/launcher/src/main/java/org/apache/spark/launcher/LauncherConnection.java @@ -0,0 +1,110 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.launcher; + +import java.io.Closeable; +import java.io.EOFException; +import java.io.IOException; +import java.io.ObjectInputStream; +import java.io.ObjectOutputStream; +import java.net.Socket; +import java.util.logging.Level; +import java.util.logging.Logger; + +import static org.apache.spark.launcher.LauncherProtocol.*; + +/** + * Encapsulates a connection between a launcher server and client. This takes care of the + * communication (sending and receiving messages), while processing of messages is left for + * the implementations. + */ +abstract class LauncherConnection implements Closeable, Runnable { + + private static final Logger LOG = Logger.getLogger(LauncherConnection.class.getName()); + + private final Socket socket; + private final ObjectOutputStream out; + + private volatile boolean closed; + + LauncherConnection(Socket socket) throws IOException { + this.socket = socket; + this.out = new ObjectOutputStream(socket.getOutputStream()); + this.closed = false; + } + + protected abstract void handle(Message msg) throws IOException; + + @Override + public void run() { + try { + ObjectInputStream in = new ObjectInputStream(socket.getInputStream()); + while (!closed) { + Message msg = (Message) in.readObject(); + handle(msg); + } + } catch (EOFException eof) { + // Remote side has closed the connection, just cleanup. + try { + close(); + } catch (Exception unused) { + // no-op. + } + } catch (Exception e) { + if (!closed) { + LOG.log(Level.WARNING, "Error in inbound message handling.", e); + try { + close(); + } catch (Exception unused) { + // no-op. + } + } + } + } + + protected synchronized void send(Message msg) throws IOException { + try { + CommandBuilderUtils.checkState(!closed, "Disconnected."); + out.writeObject(msg); + out.flush(); + } catch (IOException ioe) { + if (!closed) { + LOG.log(Level.WARNING, "Error when sending message.", ioe); + try { + close(); + } catch (Exception unused) { + // no-op. + } + } + throw ioe; + } + } + + @Override + public void close() throws IOException { + if (!closed) { + synchronized (this) { + if (!closed) { + closed = true; + socket.close(); + } + } + } + } + +} diff --git a/launcher/src/main/java/org/apache/spark/launcher/LauncherProtocol.java b/launcher/src/main/java/org/apache/spark/launcher/LauncherProtocol.java new file mode 100644 index 0000000000000..50f136497ec1a --- /dev/null +++ b/launcher/src/main/java/org/apache/spark/launcher/LauncherProtocol.java @@ -0,0 +1,93 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.launcher; + +import java.io.Closeable; +import java.io.IOException; +import java.io.ObjectInputStream; +import java.io.ObjectOutputStream; +import java.io.Serializable; +import java.net.Socket; +import java.util.Map; + +/** + * Message definitions for the launcher communication protocol. These messages must remain + * backwards-compatible, so that the launcher can talk to older versions of Spark that support + * the protocol. + */ +final class LauncherProtocol { + + /** Environment variable where the server port is stored. */ + static final String ENV_LAUNCHER_PORT = "_SPARK_LAUNCHER_PORT"; + + /** Environment variable where the secret for connecting back to the server is stored. */ + static final String ENV_LAUNCHER_SECRET = "_SPARK_LAUNCHER_SECRET"; + + static class Message implements Serializable { + + } + + /** + * Hello message, sent from client to server. + */ + static class Hello extends Message { + + final String secret; + final String sparkVersion; + + Hello(String secret, String version) { + this.secret = secret; + this.sparkVersion = version; + } + + } + + /** + * SetAppId message, sent from client to server. + */ + static class SetAppId extends Message { + + final String appId; + + SetAppId(String appId) { + this.appId = appId; + } + + } + + /** + * SetState message, sent from client to server. + */ + static class SetState extends Message { + + final SparkAppHandle.State state; + + SetState(SparkAppHandle.State state) { + this.state = state; + } + + } + + /** + * Stop message, send from server to client to stop the application. + */ + static class Stop extends Message { + + } + +} diff --git a/launcher/src/main/java/org/apache/spark/launcher/LauncherServer.java b/launcher/src/main/java/org/apache/spark/launcher/LauncherServer.java new file mode 100644 index 0000000000000..c5fd40816d62f --- /dev/null +++ b/launcher/src/main/java/org/apache/spark/launcher/LauncherServer.java @@ -0,0 +1,341 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.launcher; + +import java.io.Closeable; +import java.io.IOException; +import java.net.InetAddress; +import java.net.InetSocketAddress; +import java.net.ServerSocket; +import java.net.Socket; +import java.security.SecureRandom; +import java.util.ArrayList; +import java.util.List; +import java.util.Timer; +import java.util.TimerTask; +import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.ConcurrentMap; +import java.util.concurrent.ThreadFactory; +import java.util.concurrent.atomic.AtomicLong; +import java.util.logging.Level; +import java.util.logging.Logger; + +import static org.apache.spark.launcher.LauncherProtocol.*; + +/** + * A server that listens locally for connections from client launched by the library. Each client + * has a secret that it needs to send to the server to identify itself and establish the session. + * + * I/O is currently blocking (one thread per client). Clients have a limited time to connect back + * to the server, otherwise the server will ignore the connection. + * + * === Architecture Overview === + * + * The launcher server is used when Spark apps are launched as separate processes than the calling + * app. It looks more or less like the following: + * + * ----------------------- ----------------------- + * | User App | spark-submit | Spark App | + * | | -------------------> | | + * | ------------| |------------- | + * | | | hello | | | + * | | L. Server |<----------------------| L. Backend | | + * | | | | | | + * | ------------- ----------------------- + * | | | ^ + * | v | | + * | -------------| | + * | | | | + * | | App Handle |<------------------------------ + * | | | + * ----------------------- + * + * The server is started on demand and remains active while there are active or outstanding clients, + * to avoid opening too many ports when multiple clients are launched. Each client is given a unique + * secret, and have a limited amount of time to connect back + * ({@link SparkLauncher#CHILD_CONNECTION_TIMEOUT}), at which point the server will throw away + * that client's state. A client is only allowed to connect back to the server once. + * + * The launcher server listens on the localhost only, so it doesn't need access controls (aside from + * the per-app secret) nor encryption. It thus requires that the launched app has a local process + * that communicates with the server. In cluster mode, this means that the client that launches the + * application must remain alive for the duration of the application (or until the app handle is + * disconnected). + */ +class LauncherServer implements Closeable { + + private static final Logger LOG = Logger.getLogger(LauncherServer.class.getName()); + private static final String THREAD_NAME_FMT = "LauncherServer-%d"; + private static final long DEFAULT_CONNECT_TIMEOUT = 10000L; + + /** For creating secrets used for communication with child processes. */ + private static final SecureRandom RND = new SecureRandom(); + + private static volatile LauncherServer serverInstance; + + /** + * Creates a handle for an app to be launched. This method will start a server if one hasn't been + * started yet. The server is shared for multiple handles, and once all handles are disposed of, + * the server is shut down. + */ + static synchronized ChildProcAppHandle newAppHandle() throws IOException { + LauncherServer server = serverInstance != null ? 
serverInstance : new LauncherServer(); + server.ref(); + serverInstance = server; + + String secret = server.createSecret(); + while (server.pending.containsKey(secret)) { + secret = server.createSecret(); + } + + return server.newAppHandle(secret); + } + + static LauncherServer getServerInstance() { + return serverInstance; + } + + private final AtomicLong refCount; + private final AtomicLong threadIds; + private final ConcurrentMap pending; + private final List clients; + private final ServerSocket server; + private final Thread serverThread; + private final ThreadFactory factory; + private final Timer timeoutTimer; + + private volatile boolean running; + + private LauncherServer() throws IOException { + this.refCount = new AtomicLong(0); + + ServerSocket server = new ServerSocket(); + try { + server.setReuseAddress(true); + server.bind(new InetSocketAddress(InetAddress.getLoopbackAddress(), 0)); + + this.clients = new ArrayList(); + this.threadIds = new AtomicLong(); + this.factory = new NamedThreadFactory(THREAD_NAME_FMT); + this.pending = new ConcurrentHashMap<>(); + this.timeoutTimer = new Timer("LauncherServer-TimeoutTimer", true); + this.server = server; + this.running = true; + + this.serverThread = factory.newThread(new Runnable() { + @Override + public void run() { + acceptConnections(); + } + }); + serverThread.start(); + } catch (IOException ioe) { + close(); + throw ioe; + } catch (Exception e) { + close(); + throw new IOException(e); + } + } + + /** + * Creates a new app handle. The handle will wait for an incoming connection for a configurable + * amount of time, and if one doesn't arrive, it will transition to an error state. + */ + ChildProcAppHandle newAppHandle(String secret) { + ChildProcAppHandle handle = new ChildProcAppHandle(secret, this); + ChildProcAppHandle existing = pending.putIfAbsent(secret, handle); + CommandBuilderUtils.checkState(existing == null, "Multiple handles with the same secret."); + return handle; + } + + @Override + public void close() throws IOException { + synchronized (this) { + if (running) { + running = false; + timeoutTimer.cancel(); + server.close(); + synchronized (clients) { + List copy = new ArrayList<>(clients); + clients.clear(); + for (ServerConnection client : copy) { + client.close(); + } + } + } + } + if (serverThread != null) { + try { + serverThread.join(); + } catch (InterruptedException ie) { + // no-op + } + } + } + + void ref() { + refCount.incrementAndGet(); + } + + void unref() { + synchronized(LauncherServer.class) { + if (refCount.decrementAndGet() == 0) { + try { + close(); + } catch (IOException ioe) { + // no-op. + } finally { + serverInstance = null; + } + } + } + } + + int getPort() { + return server.getLocalPort(); + } + + /** + * Removes the client handle from the pending list (in case it's still there), and unrefs + * the server. + */ + void unregister(ChildProcAppHandle handle) { + pending.remove(handle.getSecret()); + unref(); + } + + private void acceptConnections() { + try { + while (running) { + final Socket client = server.accept(); + TimerTask timeout = new TimerTask() { + @Override + public void run() { + LOG.warning("Timed out waiting for hello message from client."); + try { + client.close(); + } catch (IOException ioe) { + // no-op. 
+ } + } + }; + ServerConnection clientConnection = new ServerConnection(client, timeout); + Thread clientThread = factory.newThread(clientConnection); + synchronized (timeout) { + clientThread.start(); + synchronized (clients) { + clients.add(clientConnection); + } + timeoutTimer.schedule(timeout, getConnectionTimeout()); + } + } + } catch (IOException ioe) { + if (running) { + LOG.log(Level.SEVERE, "Error in accept loop.", ioe); + } + } + } + + private long getConnectionTimeout() { + String value = SparkLauncher.launcherConfig.get(SparkLauncher.CHILD_CONNECTION_TIMEOUT); + return (value != null) ? Long.parseLong(value) : DEFAULT_CONNECT_TIMEOUT; + } + + private String createSecret() { + byte[] secret = new byte[128]; + RND.nextBytes(secret); + + StringBuilder sb = new StringBuilder(); + for (byte b : secret) { + int ival = b >= 0 ? b : Byte.MAX_VALUE - b; + if (ival < 0x10) { + sb.append("0"); + } + sb.append(Integer.toHexString(ival)); + } + return sb.toString(); + } + + private class ServerConnection extends LauncherConnection { + + private TimerTask timeout; + private ChildProcAppHandle handle; + + ServerConnection(Socket socket, TimerTask timeout) throws IOException { + super(socket); + this.timeout = timeout; + } + + @Override + protected void handle(Message msg) throws IOException { + try { + if (msg instanceof Hello) { + synchronized (timeout) { + timeout.cancel(); + } + timeout = null; + Hello hello = (Hello) msg; + ChildProcAppHandle handle = pending.remove(hello.secret); + if (handle != null) { + handle.setState(SparkAppHandle.State.CONNECTED); + handle.setConnection(this); + this.handle = handle; + } else { + throw new IllegalArgumentException("Received Hello for unknown client."); + } + } else { + if (handle == null) { + throw new IllegalArgumentException("Expected hello, got: " + + msg != null ? msg.getClass().getName() : null); + } + if (msg instanceof SetAppId) { + SetAppId set = (SetAppId) msg; + handle.setAppId(set.appId); + } else if (msg instanceof SetState) { + handle.setState(((SetState)msg).state); + } else { + throw new IllegalArgumentException("Invalid message: " + + msg != null ? msg.getClass().getName() : null); + } + } + } catch (Exception e) { + LOG.log(Level.INFO, "Error handling message from client.", e); + if (timeout != null) { + timeout.cancel(); + } + close(); + } finally { + timeoutTimer.purge(); + } + } + + @Override + public void close() throws IOException { + synchronized (clients) { + clients.remove(this); + } + super.close(); + if (handle != null) { + handle.disconnect(); + } + } + + } + +} diff --git a/launcher/src/main/java/org/apache/spark/launcher/NamedThreadFactory.java b/launcher/src/main/java/org/apache/spark/launcher/NamedThreadFactory.java new file mode 100644 index 0000000000000..995f4d73daaaf --- /dev/null +++ b/launcher/src/main/java/org/apache/spark/launcher/NamedThreadFactory.java @@ -0,0 +1,40 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.launcher; + +import java.util.concurrent.ThreadFactory; +import java.util.concurrent.atomic.AtomicLong; + +class NamedThreadFactory implements ThreadFactory { + + private final String nameFormat; + private final AtomicLong threadIds; + + NamedThreadFactory(String nameFormat) { + this.nameFormat = nameFormat; + this.threadIds = new AtomicLong(); + } + + @Override + public Thread newThread(Runnable r) { + Thread t = new Thread(r, String.format(nameFormat, threadIds.incrementAndGet())); + t.setDaemon(true); + return t; + } + +} diff --git a/launcher/src/main/java/org/apache/spark/launcher/OutputRedirector.java b/launcher/src/main/java/org/apache/spark/launcher/OutputRedirector.java new file mode 100644 index 0000000000000..6e7120167d605 --- /dev/null +++ b/launcher/src/main/java/org/apache/spark/launcher/OutputRedirector.java @@ -0,0 +1,78 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.launcher; + +import java.io.BufferedReader; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.io.IOException; +import java.util.concurrent.ThreadFactory; +import java.util.logging.Level; +import java.util.logging.Logger; + +/** + * Redirects lines read from a given input stream to a j.u.l.Logger (at INFO level). + */ +class OutputRedirector { + + private final BufferedReader reader; + private final Logger sink; + private final Thread thread; + + private volatile boolean active; + + OutputRedirector(InputStream in, ThreadFactory tf) { + this(in, OutputRedirector.class.getName(), tf); + } + + OutputRedirector(InputStream in, String loggerName, ThreadFactory tf) { + this.active = true; + this.reader = new BufferedReader(new InputStreamReader(in)); + this.thread = tf.newThread(new Runnable() { + @Override + public void run() { + redirect(); + } + }); + this.sink = Logger.getLogger(loggerName); + thread.start(); + } + + private void redirect() { + try { + String line; + while ((line = reader.readLine()) != null) { + if (active) { + sink.info(line.replaceFirst("\\s*$", "")); + } + } + } catch (IOException e) { + sink.log(Level.FINE, "Error reading child process output.", e); + } + } + + /** + * This method just stops the output of the process from showing up in the local logs. 
+ * The child's output will still be read (and, thus, the redirect thread will still be + * alive) to avoid the child process hanging because of lack of output buffer. + */ + void stop() { + active = false; + } + +} diff --git a/launcher/src/main/java/org/apache/spark/launcher/SparkAppHandle.java b/launcher/src/main/java/org/apache/spark/launcher/SparkAppHandle.java new file mode 100644 index 0000000000000..2896a91d5e793 --- /dev/null +++ b/launcher/src/main/java/org/apache/spark/launcher/SparkAppHandle.java @@ -0,0 +1,126 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.launcher; + +/** + * A handle to a running Spark application. + *
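Because the State enum defined below exposes isFinal(), a caller that does not want to register a listener can simply poll the handle until a terminal state is reached. A small sketch; the polling interval is arbitrary:

import org.apache.spark.launcher.SparkAppHandle

// Block until the handle reports a final state, then return it.
def awaitFinalState(handle: SparkAppHandle, pollMillis: Long = 500L): SparkAppHandle.State = {
  while (!handle.getState.isFinal) {
    Thread.sleep(pollMillis)
  }
  handle.getState
}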

+ * Provides runtime information about the underlying Spark application, and actions to control it. + * + * @since 1.6.0 + */ +public interface SparkAppHandle { + + /** + * Represents the application's state. A state can be "final", in which case it will not change + * after it's reached, and means the application is not running anymore. + * + * @since 1.6.0 + */ + public enum State { + /** The application has not reported back yet. */ + UNKNOWN(false), + /** The application has connected to the handle. */ + CONNECTED(false), + /** The application has been submitted to the cluster. */ + SUBMITTED(false), + /** The application is running. */ + RUNNING(false), + /** The application finished with a successful status. */ + FINISHED(true), + /** The application finished with a failed status. */ + FAILED(true), + /** The application was killed. */ + KILLED(true); + + private final boolean isFinal; + + State(boolean isFinal) { + this.isFinal = isFinal; + } + + /** + * Whether this state is a final state, meaning the application is not running anymore + * once it's reached. + */ + public boolean isFinal() { + return isFinal; + } + } + + /** + * Adds a listener to be notified of changes to the handle's information. Listeners will be called + * from the thread processing updates from the application, so they should avoid blocking or + * long-running operations. + * + * @param l Listener to add. + */ + void addListener(Listener l); + + /** Returns the current application state. */ + State getState(); + + /** Returns the application ID, or null if not yet known. */ + String getAppId(); + + /** + * Asks the application to stop. This is best-effort, since the application may fail to receive + * or act on the command. Callers should watch for a state transition that indicates the + * application has really stopped. + */ + void stop(); + + /** + * Tries to kill the underlying application. Implies {@link #disconnect()}. This will not send + * a {@link #stop()} message to the application, so it's recommended that users first try to + * stop the application cleanly and only resort to this method if that fails. + */ + void kill(); + + /** + * Disconnects the handle from the application, without stopping it. After this method is called, + * the handle will not be able to communicate with the application anymore. + */ + void disconnect(); + + /** + * Listener for updates to a handle's state. The callbacks do not receive information about + * what exactly has changed, just that an update has occurred. + * + * @since 1.6.0 + */ + public interface Listener { + + /** + * Callback for changes in the handle's state. + * + * @param handle The updated handle. + * @see {@link SparkAppHandle#getState()} + */ + void stateChanged(SparkAppHandle handle); + + /** + * Callback for changes in any information that is not the handle's state. + * + * @param handle The updated handle. 
+ */ + void infoChanged(SparkAppHandle handle); + + } + +} diff --git a/launcher/src/main/java/org/apache/spark/launcher/SparkLauncher.java b/launcher/src/main/java/org/apache/spark/launcher/SparkLauncher.java index 57993405e47be..5d74b37033a51 100644 --- a/launcher/src/main/java/org/apache/spark/launcher/SparkLauncher.java +++ b/launcher/src/main/java/org/apache/spark/launcher/SparkLauncher.java @@ -21,8 +21,10 @@ import java.io.IOException; import java.util.ArrayList; import java.util.Arrays; +import java.util.HashMap; import java.util.List; import java.util.Map; +import java.util.concurrent.atomic.AtomicInteger; import static org.apache.spark.launcher.CommandBuilderUtils.*; @@ -58,6 +60,33 @@ public class SparkLauncher { /** Configuration key for the number of executor CPU cores. */ public static final String EXECUTOR_CORES = "spark.executor.cores"; + /** Logger name to use when launching a child process. */ + public static final String CHILD_PROCESS_LOGGER_NAME = "spark.launcher.childProcLoggerName"; + + /** + * Maximum time (in ms) to wait for a child process to connect back to the launcher server + * when using @link{#start()}. + */ + public static final String CHILD_CONNECTION_TIMEOUT = "spark.launcher.childConectionTimeout"; + + /** Used internally to create unique logger names. */ + private static final AtomicInteger COUNTER = new AtomicInteger(); + + static final Map launcherConfig = new HashMap(); + + /** + * Set a configuration value for the launcher library. These config values do not affect the + * launched application, but rather the behavior of the launcher library itself when managing + * applications. + * + * @since 1.6.0 + * @param name Config name. + * @param value Config value. + */ + public static void setConfig(String name, String value) { + launcherConfig.put(name, value); + } + // Visible for testing. final SparkSubmitCommandBuilder builder; @@ -109,7 +138,7 @@ public SparkLauncher setSparkHome(String sparkHome) { */ public SparkLauncher setPropertiesFile(String path) { checkNotNull(path, "path"); - builder.propertiesFile = path; + builder.setPropertiesFile(path); return this; } @@ -197,6 +226,7 @@ public SparkLauncher setMainClass(String mainClass) { * Use this method with caution. It is possible to create an invalid Spark command by passing * unknown arguments to this method, since those are allowed for forward compatibility. * + * @since 1.5.0 * @param arg Argument to add. * @return This launcher. */ @@ -218,6 +248,7 @@ public SparkLauncher addSparkArg(String arg) { * Use this method with caution. It is possible to create an invalid Spark command by passing * unknown arguments to this method, since those are allowed for forward compatibility. * + * @since 1.5.0 * @param name Name of argument to add. * @param value Value of the argument. * @return This launcher. @@ -319,10 +350,81 @@ public SparkLauncher setVerbose(boolean verbose) { /** * Launches a sub-process that will start the configured Spark application. + *
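The launcher-level settings introduced above (such as the child connection timeout) apply to the launcher library in the calling process rather than to any particular application, and are set through the static setConfig method. A minimal sketch; the 30-second value is arbitrary:

import org.apache.spark.launcher.SparkLauncher

// Give child processes more time to connect back to the launcher server (value in ms).
SparkLauncher.setConfig(SparkLauncher.CHILD_CONNECTION_TIMEOUT, "30000")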

+ * The {@link #startApplication(SparkAppHandle.Listener...)} method is preferred when launching + * Spark, since it provides better control of the child application. * * @return A process handle for the Spark app. */ public Process launch() throws IOException { + return createBuilder().start(); + } + + /** + * Starts a Spark application. + *

+ * This method returns a handle that provides information about the running application and can + * be used to do basic interaction with it. + *

+ * The returned handle assumes that the application will instantiate a single SparkContext + * during its lifetime. Once that context reports a final state (one that indicates the + * SparkContext has stopped), the handle will not perform new state transitions, so anything + * that happens after that cannot be monitored. If the underlying application is launched as + * a child process, {@link SparkAppHandle#kill()} can still be used to kill the child process. + *
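For reference, a sketch of how the returned handle and a listener fit together when called from Scala. The app resource, main class, and master are the same illustrative values used in the package javadoc later in this patch, and the listener bodies are placeholders; as the SparkAppHandle docs note, callbacks run on the connection thread and should not block:

import org.apache.spark.launcher.{SparkAppHandle, SparkLauncher}

val handle = new SparkLauncher()
  .setAppResource("/my/app.jar")
  .setMainClass("my.spark.app.Main")
  .setMaster("local")
  .startApplication(new SparkAppHandle.Listener {
    // Called on every state transition reported by the application.
    override def stateChanged(h: SparkAppHandle): Unit =
      println(s"state -> ${h.getState}")
    // Called when non-state information (e.g. the app ID) changes.
    override def infoChanged(h: SparkAppHandle): Unit =
      println(s"app id -> ${h.getAppId}")
  })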

+ * Currently, all applications are launched as child processes. The child's stdout and stderr + * are merged and written to a logger (see java.util.logging). The logger's name + * can be defined by setting {@link #CHILD_PROCESS_LOGGER_NAME} in the app's configuration. If + * that option is not set, the code will try to derive a name from the application's name or + * main class / script file. If those cannot be determined, an internal, unique name will be + * used. In all cases, the logger name will start with "org.apache.spark.launcher.app", to fit + * more easily into the configuration of commonly-used logging systems. + * + * @since 1.6.0 + * @param listeners Listeners to add to the handle before the app is launched. + * @return A handle for the launched application. + */ + public SparkAppHandle startApplication(SparkAppHandle.Listener... listeners) throws IOException { + ChildProcAppHandle handle = LauncherServer.newAppHandle(); + for (SparkAppHandle.Listener l : listeners) { + handle.addListener(l); + } + + String appName = builder.getEffectiveConfig().get(CHILD_PROCESS_LOGGER_NAME); + if (appName == null) { + if (builder.appName != null) { + appName = builder.appName; + } else if (builder.mainClass != null) { + int dot = builder.mainClass.lastIndexOf("."); + if (dot >= 0 && dot < builder.mainClass.length() - 1) { + appName = builder.mainClass.substring(dot + 1, builder.mainClass.length()); + } else { + appName = builder.mainClass; + } + } else if (builder.appResource != null) { + appName = new File(builder.appResource).getName(); + } else { + appName = String.valueOf(COUNTER.incrementAndGet()); + } + } + + String loggerPrefix = getClass().getPackage().getName(); + String loggerName = String.format("%s.app.%s", loggerPrefix, appName); + ProcessBuilder pb = createBuilder().redirectErrorStream(true); + pb.environment().put(LauncherProtocol.ENV_LAUNCHER_PORT, + String.valueOf(LauncherServer.getServerInstance().getPort())); + pb.environment().put(LauncherProtocol.ENV_LAUNCHER_SECRET, handle.getSecret()); + try { + handle.setChildProc(pb.start(), loggerName); + } catch (IOException ioe) { + handle.kill(); + throw ioe; + } + + return handle; + } + + private ProcessBuilder createBuilder() { List cmd = new ArrayList(); String script = isWindows() ? "spark-submit.cmd" : "spark-submit"; cmd.add(join(File.separator, builder.getSparkHome(), "bin", script)); @@ -343,7 +445,7 @@ public Process launch() throws IOException { for (Map.Entry e : builder.childEnv.entrySet()) { pb.environment().put(e.getKey(), e.getValue()); } - return pb.start(); + return pb; } private static class ArgumentValidator extends SparkSubmitOptionParser { diff --git a/launcher/src/main/java/org/apache/spark/launcher/SparkSubmitCommandBuilder.java b/launcher/src/main/java/org/apache/spark/launcher/SparkSubmitCommandBuilder.java index fc87814a59ed5..39b46e0db8cc2 100644 --- a/launcher/src/main/java/org/apache/spark/launcher/SparkSubmitCommandBuilder.java +++ b/launcher/src/main/java/org/apache/spark/launcher/SparkSubmitCommandBuilder.java @@ -188,10 +188,9 @@ private List buildSparkSubmitCommand(Map env) throws IOE // Load the properties file and check whether spark-submit will be running the app's driver // or just launching a cluster app. When running the driver, the JVM's argument will be // modified to cover the driver's configuration. - Properties props = loadPropertiesFile(); - boolean isClientMode = isClientMode(props); - String extraClassPath = isClientMode ? 
- firstNonEmptyValue(SparkLauncher.DRIVER_EXTRA_CLASSPATH, conf, props) : null; + Map config = getEffectiveConfig(); + boolean isClientMode = isClientMode(config); + String extraClassPath = isClientMode ? config.get(SparkLauncher.DRIVER_EXTRA_CLASSPATH) : null; List cmd = buildJavaCommand(extraClassPath); // Take Thrift Server as daemon @@ -212,14 +211,13 @@ private List buildSparkSubmitCommand(Map env) throws IOE // Take Thrift Server as daemon String tsMemory = isThriftServer(mainClass) ? System.getenv("SPARK_DAEMON_MEMORY") : null; - String memory = firstNonEmpty(tsMemory, - firstNonEmptyValue(SparkLauncher.DRIVER_MEMORY, conf, props), + String memory = firstNonEmpty(tsMemory, config.get(SparkLauncher.DRIVER_MEMORY), System.getenv("SPARK_DRIVER_MEMORY"), System.getenv("SPARK_MEM"), DEFAULT_MEM); cmd.add("-Xms" + memory); cmd.add("-Xmx" + memory); - addOptionString(cmd, firstNonEmptyValue(SparkLauncher.DRIVER_EXTRA_JAVA_OPTIONS, conf, props)); + addOptionString(cmd, config.get(SparkLauncher.DRIVER_EXTRA_JAVA_OPTIONS)); mergeEnvPathList(env, getLibPathEnvName(), - firstNonEmptyValue(SparkLauncher.DRIVER_EXTRA_LIBRARY_PATH, conf, props)); + config.get(SparkLauncher.DRIVER_EXTRA_LIBRARY_PATH)); } addPermGenSizeOpt(cmd); @@ -281,9 +279,8 @@ private List buildSparkRCommand(Map env) throws IOExcept private void constructEnvVarArgs( Map env, String submitArgsEnvVariable) throws IOException { - Properties props = loadPropertiesFile(); mergeEnvPathList(env, getLibPathEnvName(), - firstNonEmptyValue(SparkLauncher.DRIVER_EXTRA_LIBRARY_PATH, conf, props)); + getEffectiveConfig().get(SparkLauncher.DRIVER_EXTRA_LIBRARY_PATH)); StringBuilder submitArgs = new StringBuilder(); for (String arg : buildSparkSubmitArgs()) { @@ -295,9 +292,8 @@ private void constructEnvVarArgs( env.put(submitArgsEnvVariable, submitArgs.toString()); } - - private boolean isClientMode(Properties userProps) { - String userMaster = firstNonEmpty(master, (String) userProps.get(SparkLauncher.SPARK_MASTER)); + private boolean isClientMode(Map userProps) { + String userMaster = firstNonEmpty(master, userProps.get(SparkLauncher.SPARK_MASTER)); // Default master is "local[*]", so assume client mode in that case. return userMaster == null || "client".equals(deployMode) || diff --git a/launcher/src/main/java/org/apache/spark/launcher/package-info.java b/launcher/src/main/java/org/apache/spark/launcher/package-info.java index 7c97dba511b28..d1ac39bdc76a9 100644 --- a/launcher/src/main/java/org/apache/spark/launcher/package-info.java +++ b/launcher/src/main/java/org/apache/spark/launcher/package-info.java @@ -17,17 +17,42 @@ /** * Library for launching Spark applications. - * + * *

* This library allows applications to launch Spark programmatically. There's only one entry * point to the library - the {@link org.apache.spark.launcher.SparkLauncher} class. *

* *

- * To launch a Spark application, just instantiate a {@link org.apache.spark.launcher.SparkLauncher} - * and configure the application to run. For example: + * The {@link org.apache.spark.launcher.SparkLauncher#startApplication( + * org.apache.spark.launcher.SparkAppHandle.Listener...)} can be used to start Spark and provide + * a handle to monitor and control the running application: *

- * + * + *
+ * {@code
+ *   import org.apache.spark.launcher.SparkAppHandle;
+ *   import org.apache.spark.launcher.SparkLauncher;
+ *
+ *   public class MyLauncher {
+ *     public static void main(String[] args) throws Exception {
+ *       SparkAppHandle handle = new SparkLauncher()
+ *         .setAppResource("/my/app.jar")
+ *         .setMainClass("my.spark.app.Main")
+ *         .setMaster("local")
+ *         .setConf(SparkLauncher.DRIVER_MEMORY, "2g")
+ *         .startApplication();
+ *       // Use handle API to monitor / control application.
+ *     }
+ *   }
+ * }
+ * 
+ * + *

+ * It's also possible to launch a raw child process, using the + * {@link org.apache.spark.launcher.SparkLauncher#launch()} method: + *

+ * *
  * {@code
  *   import org.apache.spark.launcher.SparkLauncher;
@@ -45,5 +70,10 @@
  *   }
  * }
  * 
+ * + *

This method requires the calling code to manually manage the child process, including its + * output streams (to avoid possible deadlocks). It's recommended that + * {@link org.apache.spark.launcher.SparkLauncher#startApplication( + * org.apache.spark.launcher.SparkAppHandle.Listener...)} be used instead.
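A sketch of the kind of manual stream handling this implies when using launch() directly; the drain thread below is illustrative, not part of the library, and stderr needs the same treatment as stdout:

import java.io.{BufferedReader, InputStreamReader}
import org.apache.spark.launcher.SparkLauncher

val proc = new SparkLauncher()
  .setAppResource("/my/app.jar")
  .setMainClass("my.spark.app.Main")
  .setMaster("local")
  .launch()

// Drain stdout so the child cannot block on a full pipe.
val drainer = new Thread(new Runnable {
  override def run(): Unit = {
    val reader = new BufferedReader(new InputStreamReader(proc.getInputStream))
    Iterator.continually(reader.readLine()).takeWhile(_ != null).foreach(println)
  }
})
drainer.setDaemon(true)
drainer.start()

proc.waitFor()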

*/ package org.apache.spark.launcher; diff --git a/launcher/src/test/java/org/apache/spark/launcher/BaseSuite.java b/launcher/src/test/java/org/apache/spark/launcher/BaseSuite.java new file mode 100644 index 0000000000000..23e2c64d6dcd7 --- /dev/null +++ b/launcher/src/test/java/org/apache/spark/launcher/BaseSuite.java @@ -0,0 +1,32 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.launcher; + +import org.slf4j.bridge.SLF4JBridgeHandler; + +/** + * Handles configuring the JUL -> SLF4J bridge. + */ +class BaseSuite { + + static { + SLF4JBridgeHandler.removeHandlersForRootLogger(); + SLF4JBridgeHandler.install(); + } + +} diff --git a/launcher/src/test/java/org/apache/spark/launcher/LauncherServerSuite.java b/launcher/src/test/java/org/apache/spark/launcher/LauncherServerSuite.java new file mode 100644 index 0000000000000..27cd1061a15b3 --- /dev/null +++ b/launcher/src/test/java/org/apache/spark/launcher/LauncherServerSuite.java @@ -0,0 +1,188 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.launcher; + +import java.io.Closeable; +import java.io.IOException; +import java.net.InetAddress; +import java.net.Socket; +import java.util.concurrent.BlockingQueue; +import java.util.concurrent.LinkedBlockingQueue; +import java.util.concurrent.TimeUnit; + +import org.junit.Test; +import static org.junit.Assert.*; +import static org.mockito.Mockito.*; + +import static org.apache.spark.launcher.LauncherProtocol.*; + +public class LauncherServerSuite extends BaseSuite { + + @Test + public void testLauncherServerReuse() throws Exception { + ChildProcAppHandle handle1 = null; + ChildProcAppHandle handle2 = null; + ChildProcAppHandle handle3 = null; + + try { + handle1 = LauncherServer.newAppHandle(); + handle2 = LauncherServer.newAppHandle(); + LauncherServer server1 = handle1.getServer(); + assertSame(server1, handle2.getServer()); + + handle1.kill(); + handle2.kill(); + + handle3 = LauncherServer.newAppHandle(); + assertNotSame(server1, handle3.getServer()); + + handle3.kill(); + + assertNull(LauncherServer.getServerInstance()); + } finally { + kill(handle1); + kill(handle2); + kill(handle3); + } + } + + @Test + public void testCommunication() throws Exception { + ChildProcAppHandle handle = LauncherServer.newAppHandle(); + TestClient client = null; + try { + Socket s = new Socket(InetAddress.getLoopbackAddress(), + LauncherServer.getServerInstance().getPort()); + + final Object waitLock = new Object(); + handle.addListener(new SparkAppHandle.Listener() { + @Override + public void stateChanged(SparkAppHandle handle) { + wakeUp(); + } + + @Override + public void infoChanged(SparkAppHandle handle) { + wakeUp(); + } + + private void wakeUp() { + synchronized (waitLock) { + waitLock.notifyAll(); + } + } + }); + + client = new TestClient(s); + synchronized (waitLock) { + client.send(new Hello(handle.getSecret(), "1.4.0")); + waitLock.wait(TimeUnit.SECONDS.toMillis(10)); + } + + // Make sure the server matched the client to the handle. + assertNotNull(handle.getConnection()); + + synchronized (waitLock) { + client.send(new SetAppId("app-id")); + waitLock.wait(TimeUnit.SECONDS.toMillis(10)); + } + assertEquals("app-id", handle.getAppId()); + + synchronized (waitLock) { + client.send(new SetState(SparkAppHandle.State.RUNNING)); + waitLock.wait(TimeUnit.SECONDS.toMillis(10)); + } + assertEquals(SparkAppHandle.State.RUNNING, handle.getState()); + + handle.stop(); + Message stopMsg = client.inbound.poll(10, TimeUnit.SECONDS); + assertTrue(stopMsg instanceof Stop); + } finally { + kill(handle); + close(client); + client.clientThread.join(); + } + } + + @Test + public void testTimeout() throws Exception { + final long TEST_TIMEOUT = 10L; + + ChildProcAppHandle handle = null; + TestClient client = null; + try { + SparkLauncher.setConfig(SparkLauncher.CHILD_CONNECTION_TIMEOUT, String.valueOf(TEST_TIMEOUT)); + + handle = LauncherServer.newAppHandle(); + + Socket s = new Socket(InetAddress.getLoopbackAddress(), + LauncherServer.getServerInstance().getPort()); + client = new TestClient(s); + + Thread.sleep(TEST_TIMEOUT * 10); + try { + client.send(new Hello(handle.getSecret(), "1.4.0")); + fail("Expected exception caused by connection timeout."); + } catch (IllegalStateException e) { + // Expected. 
+ } + } finally { + SparkLauncher.launcherConfig.remove(SparkLauncher.CHILD_CONNECTION_TIMEOUT); + kill(handle); + close(client); + } + } + + private void kill(SparkAppHandle handle) { + if (handle != null) { + handle.kill(); + } + } + + private void close(Closeable c) { + if (c != null) { + try { + c.close(); + } catch (Exception e) { + // no-op. + } + } + } + + private static class TestClient extends LauncherConnection { + + final BlockingQueue inbound; + final Thread clientThread; + + TestClient(Socket s) throws IOException { + super(s); + this.inbound = new LinkedBlockingQueue(); + this.clientThread = new Thread(this); + clientThread.setName("TestClient"); + clientThread.setDaemon(true); + clientThread.start(); + } + + @Override + protected void handle(Message msg) throws IOException { + inbound.offer(msg); + } + + } + +} diff --git a/launcher/src/test/java/org/apache/spark/launcher/SparkSubmitCommandBuilderSuite.java b/launcher/src/test/java/org/apache/spark/launcher/SparkSubmitCommandBuilderSuite.java index 7329ac9f7fb8c..d5397b0685046 100644 --- a/launcher/src/test/java/org/apache/spark/launcher/SparkSubmitCommandBuilderSuite.java +++ b/launcher/src/test/java/org/apache/spark/launcher/SparkSubmitCommandBuilderSuite.java @@ -30,7 +30,7 @@ import org.junit.Test; import static org.junit.Assert.*; -public class SparkSubmitCommandBuilderSuite { +public class SparkSubmitCommandBuilderSuite extends BaseSuite { private static File dummyPropsFile; private static SparkSubmitOptionParser parser; @@ -161,7 +161,7 @@ private void testCmdBuilder(boolean isDriver) throws Exception { launcher.appResource = "/foo"; launcher.appName = "MyApp"; launcher.mainClass = "my.Class"; - launcher.propertiesFile = dummyPropsFile.getAbsolutePath(); + launcher.setPropertiesFile(dummyPropsFile.getAbsolutePath()); launcher.appArgs.add("foo"); launcher.appArgs.add("bar"); launcher.conf.put(SparkLauncher.DRIVER_MEMORY, "1g"); diff --git a/launcher/src/test/java/org/apache/spark/launcher/SparkSubmitOptionParserSuite.java b/launcher/src/test/java/org/apache/spark/launcher/SparkSubmitOptionParserSuite.java index f3d2109917056..3ee5b8cf9689d 100644 --- a/launcher/src/test/java/org/apache/spark/launcher/SparkSubmitOptionParserSuite.java +++ b/launcher/src/test/java/org/apache/spark/launcher/SparkSubmitOptionParserSuite.java @@ -28,7 +28,7 @@ import static org.apache.spark.launcher.SparkSubmitOptionParser.*; -public class SparkSubmitOptionParserSuite { +public class SparkSubmitOptionParserSuite extends BaseSuite { private SparkSubmitOptionParser parser; diff --git a/launcher/src/test/resources/log4j.properties b/launcher/src/test/resources/log4j.properties index 67a6a98217118..c64b1565e1469 100644 --- a/launcher/src/test/resources/log4j.properties +++ b/launcher/src/test/resources/log4j.properties @@ -16,16 +16,19 @@ # # Set everything to be logged to the file core/target/unit-tests.log -log4j.rootCategory=INFO, file +test.appender=file +log4j.rootCategory=INFO, ${test.appender} log4j.appender.file=org.apache.log4j.FileAppender log4j.appender.file.append=false - -# Some tests will set "test.name" to avoid overwriting the main log file. 
-log4j.appender.file.file=target/unit-tests${test.name}.log - +log4j.appender.file.file=target/unit-tests.log log4j.appender.file.layout=org.apache.log4j.PatternLayout log4j.appender.file.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss.SSS} %t %p %c{1}: %m%n +log4j.appender.childproc=org.apache.log4j.ConsoleAppender +log4j.appender.childproc.target=System.err +log4j.appender.childproc.layout=org.apache.log4j.PatternLayout +log4j.appender.childproc.layout.ConversionPattern=%t: %m%n + # Ignore messages below warning level from Jetty, because it's a bit verbose log4j.logger.org.spark-project.jetty=WARN org.spark-project.jetty.LEVEL=WARN diff --git a/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala b/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala index eb3b7fb885087..cec81b940644c 100644 --- a/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala +++ b/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala @@ -55,8 +55,8 @@ import org.apache.hadoop.yarn.exceptions.ApplicationNotFoundException import org.apache.hadoop.yarn.util.Records import org.apache.spark.{Logging, SecurityManager, SparkConf, SparkContext, SparkException} +import org.apache.spark.launcher.{LauncherBackend, SparkAppHandle, YarnCommandBuilderUtils} import org.apache.spark.deploy.SparkHadoopUtil -import org.apache.spark.launcher.YarnCommandBuilderUtils import org.apache.spark.util.Utils private[spark] class Client( @@ -70,8 +70,6 @@ private[spark] class Client( def this(clientArgs: ClientArguments, spConf: SparkConf) = this(clientArgs, SparkHadoopUtil.get.newConfiguration(spConf), spConf) - def this(clientArgs: ClientArguments) = this(clientArgs, new SparkConf()) - private val yarnClient = YarnClient.createYarnClient private val yarnConf = new YarnConfiguration(hadoopConf) private var credentials: Credentials = null @@ -84,10 +82,27 @@ private[spark] class Client( private var principal: String = null private var keytab: String = null + private val launcherBackend = new LauncherBackend() { + override def onStopRequest(): Unit = { + if (isClusterMode && appId != null) { + yarnClient.killApplication(appId) + } else { + setState(SparkAppHandle.State.KILLED) + stop() + } + } + } private val fireAndForget = isClusterMode && !sparkConf.getBoolean("spark.yarn.submit.waitAppCompletion", true) + private var appId: ApplicationId = null + + def reportLauncherState(state: SparkAppHandle.State): Unit = { + launcherBackend.setState(state) + } + def stop(): Unit = { + launcherBackend.close() yarnClient.stop() // Unset YARN mode system env variable, to allow switching between cluster types. System.clearProperty("SPARK_YARN_MODE") @@ -103,6 +118,7 @@ private[spark] class Client( def submitApplication(): ApplicationId = { var appId: ApplicationId = null try { + launcherBackend.connect() // Setup the credentials before doing anything else, // so we have don't have issues at any point. 
setupCredentials() @@ -116,6 +132,8 @@ private[spark] class Client( val newApp = yarnClient.createApplication() val newAppResponse = newApp.getNewApplicationResponse() appId = newAppResponse.getApplicationId() + reportLauncherState(SparkAppHandle.State.SUBMITTED) + launcherBackend.setAppId(appId.toString()) // Verify whether the cluster has enough resources for our AM verifyClusterResources(newAppResponse) @@ -881,6 +899,20 @@ private[spark] class Client( } } + if (lastState != state) { + state match { + case YarnApplicationState.RUNNING => + reportLauncherState(SparkAppHandle.State.RUNNING) + case YarnApplicationState.FINISHED => + reportLauncherState(SparkAppHandle.State.FINISHED) + case YarnApplicationState.FAILED => + reportLauncherState(SparkAppHandle.State.FAILED) + case YarnApplicationState.KILLED => + reportLauncherState(SparkAppHandle.State.KILLED) + case _ => + } + } + if (state == YarnApplicationState.FINISHED || state == YarnApplicationState.FAILED || state == YarnApplicationState.KILLED) { @@ -928,8 +960,8 @@ private[spark] class Client( * throw an appropriate SparkException. */ def run(): Unit = { - val appId = submitApplication() - if (fireAndForget) { + this.appId = submitApplication() + if (!launcherBackend.isConnected() && fireAndForget) { val report = getApplicationReport(appId) val state = report.getYarnApplicationState logInfo(s"Application report for $appId (state: $state)") @@ -971,6 +1003,7 @@ private[spark] class Client( } object Client extends Logging { + def main(argStrings: Array[String]) { if (!sys.props.contains("SPARK_SUBMIT")) { logWarning("WARNING: This client is deprecated and will be removed in a " + diff --git a/yarn/src/main/scala/org/apache/spark/scheduler/cluster/YarnClientSchedulerBackend.scala b/yarn/src/main/scala/org/apache/spark/scheduler/cluster/YarnClientSchedulerBackend.scala index 36d5759554d98..20771f655473c 100644 --- a/yarn/src/main/scala/org/apache/spark/scheduler/cluster/YarnClientSchedulerBackend.scala +++ b/yarn/src/main/scala/org/apache/spark/scheduler/cluster/YarnClientSchedulerBackend.scala @@ -23,6 +23,7 @@ import org.apache.hadoop.yarn.api.records.{ApplicationId, YarnApplicationState} import org.apache.spark.{SparkException, Logging, SparkContext} import org.apache.spark.deploy.yarn.{Client, ClientArguments, YarnSparkHadoopUtil} +import org.apache.spark.launcher.SparkAppHandle import org.apache.spark.scheduler.TaskSchedulerImpl private[spark] class YarnClientSchedulerBackend( @@ -177,6 +178,15 @@ private[spark] class YarnClientSchedulerBackend( if (monitorThread != null) { monitorThread.stopMonitor() } + + // Report a final state to the launcher if one is connected. This is needed since in client + // mode this backend doesn't let the app monitor loop run to completion, so it does not report + // the final state itself. + // + // Note: there's not enough information at this point to provide a better final state, + // so assume the application was successful. 
+ client.reportLauncherState(SparkAppHandle.State.FINISHED) + super.stop() YarnSparkHadoopUtil.get.stopExecutorDelegationTokenRenewer() client.stop() diff --git a/yarn/src/test/resources/log4j.properties b/yarn/src/test/resources/log4j.properties index 6b8a5dbf6373e..6b9a799954bf1 100644 --- a/yarn/src/test/resources/log4j.properties +++ b/yarn/src/test/resources/log4j.properties @@ -23,6 +23,9 @@ log4j.appender.file.file=target/unit-tests.log log4j.appender.file.layout=org.apache.log4j.PatternLayout log4j.appender.file.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss.SSS} %t %p %c{1}: %m%n -# Ignore messages below warning level from Jetty, because it's a bit verbose -log4j.logger.org.spark-project.jetty=WARN +# Ignore messages below warning level from a few verbose libraries. +log4j.logger.com.sun.jersey=WARN log4j.logger.org.apache.hadoop=WARN +log4j.logger.org.eclipse.jetty=WARN +log4j.logger.org.mortbay=WARN +log4j.logger.org.spark-project.jetty=WARN diff --git a/yarn/src/test/scala/org/apache/spark/deploy/yarn/BaseYarnClusterSuite.scala b/yarn/src/test/scala/org/apache/spark/deploy/yarn/BaseYarnClusterSuite.scala index 17c59ff06e0c1..12494b01054ba 100644 --- a/yarn/src/test/scala/org/apache/spark/deploy/yarn/BaseYarnClusterSuite.scala +++ b/yarn/src/test/scala/org/apache/spark/deploy/yarn/BaseYarnClusterSuite.scala @@ -22,15 +22,18 @@ import java.util.Properties import java.util.concurrent.TimeUnit import scala.collection.JavaConverters._ +import scala.concurrent.duration._ +import scala.language.postfixOps import com.google.common.base.Charsets.UTF_8 import com.google.common.io.Files import org.apache.hadoop.yarn.conf.YarnConfiguration import org.apache.hadoop.yarn.server.MiniYARNCluster import org.scalatest.{BeforeAndAfterAll, Matchers} +import org.scalatest.concurrent.Eventually._ import org.apache.spark._ -import org.apache.spark.launcher.TestClasspathBuilder +import org.apache.spark.launcher._ import org.apache.spark.util.Utils abstract class BaseYarnClusterSuite @@ -46,13 +49,14 @@ abstract class BaseYarnClusterSuite |log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n |log4j.logger.org.apache.hadoop=WARN |log4j.logger.org.eclipse.jetty=WARN + |log4j.logger.org.mortbay=WARN |log4j.logger.org.spark-project.jetty=WARN """.stripMargin private var yarnCluster: MiniYARNCluster = _ protected var tempDir: File = _ private var fakeSparkJar: File = _ - private var hadoopConfDir: File = _ + protected var hadoopConfDir: File = _ private var logConfDir: File = _ def newYarnConfig(): YarnConfiguration @@ -120,15 +124,77 @@ abstract class BaseYarnClusterSuite clientMode: Boolean, klass: String, appArgs: Seq[String] = Nil, - sparkArgs: Seq[String] = Nil, + sparkArgs: Seq[(String, String)] = Nil, extraClassPath: Seq[String] = Nil, extraJars: Seq[String] = Nil, extraConf: Map[String, String] = Map(), - extraEnv: Map[String, String] = Map()): Unit = { + extraEnv: Map[String, String] = Map()): SparkAppHandle.State = { val master = if (clientMode) "yarn-client" else "yarn-cluster" - val props = new Properties() + val propsFile = createConfFile(extraClassPath = extraClassPath, extraConf = extraConf) + val env = Map("YARN_CONF_DIR" -> hadoopConfDir.getAbsolutePath()) ++ extraEnv + + val launcher = new SparkLauncher(env.asJava) + if (klass.endsWith(".py")) { + launcher.setAppResource(klass) + } else { + launcher.setMainClass(klass) + launcher.setAppResource(fakeSparkJar.getAbsolutePath()) + } + launcher.setSparkHome(sys.props("spark.test.home")) + .setMaster(master) + 
.setConf("spark.executor.instances", "1") + .setPropertiesFile(propsFile) + .addAppArgs(appArgs.toArray: _*) + + sparkArgs.foreach { case (name, value) => + if (value != null) { + launcher.addSparkArg(name, value) + } else { + launcher.addSparkArg(name) + } + } + extraJars.foreach(launcher.addJar) - props.setProperty("spark.yarn.jar", "local:" + fakeSparkJar.getAbsolutePath()) + val handle = launcher.startApplication() + try { + eventually(timeout(2 minutes), interval(1 second)) { + assert(handle.getState().isFinal()) + } + } finally { + handle.kill() + } + + handle.getState() + } + + /** + * This is a workaround for an issue with yarn-cluster mode: the Client class will not provide + * any sort of error when the job process finishes successfully, but the job itself fails. So + * the tests enforce that something is written to a file after everything is ok to indicate + * that the job succeeded. + */ + protected def checkResult(finalState: SparkAppHandle.State, result: File): Unit = { + checkResult(finalState, result, "success") + } + + protected def checkResult( + finalState: SparkAppHandle.State, + result: File, + expected: String): Unit = { + finalState should be (SparkAppHandle.State.FINISHED) + val resultString = Files.toString(result, UTF_8) + resultString should be (expected) + } + + protected def mainClassName(klass: Class[_]): String = { + klass.getName().stripSuffix("$") + } + + protected def createConfFile( + extraClassPath: Seq[String] = Nil, + extraConf: Map[String, String] = Map()): String = { + val props = new Properties() + props.put("spark.yarn.jar", "local:" + fakeSparkJar.getAbsolutePath()) val testClasspath = new TestClasspathBuilder() .buildClassPath( @@ -138,69 +204,28 @@ abstract class BaseYarnClusterSuite .asScala .mkString(File.pathSeparator) - props.setProperty("spark.driver.extraClassPath", testClasspath) - props.setProperty("spark.executor.extraClassPath", testClasspath) + props.put("spark.driver.extraClassPath", testClasspath) + props.put("spark.executor.extraClassPath", testClasspath) // SPARK-4267: make sure java options are propagated correctly. props.setProperty("spark.driver.extraJavaOptions", "-Dfoo=\"one two three\"") props.setProperty("spark.executor.extraJavaOptions", "-Dfoo=\"one two three\"") - yarnCluster.getConfig.asScala.foreach { e => + yarnCluster.getConfig().asScala.foreach { e => props.setProperty("spark.hadoop." 
+ e.getKey(), e.getValue()) } - sys.props.foreach { case (k, v) => if (k.startsWith("spark.")) { props.setProperty(k, v) } } - extraConf.foreach { case (k, v) => props.setProperty(k, v) } val propsFile = File.createTempFile("spark", ".properties", tempDir) val writer = new OutputStreamWriter(new FileOutputStream(propsFile), UTF_8) props.store(writer, "Spark properties.") writer.close() - - val extraJarArgs = if (extraJars.nonEmpty) Seq("--jars", extraJars.mkString(",")) else Nil - val mainArgs = - if (klass.endsWith(".py")) { - Seq(klass) - } else { - Seq("--class", klass, fakeSparkJar.getAbsolutePath()) - } - val argv = - Seq( - new File(sys.props("spark.test.home"), "bin/spark-submit").getAbsolutePath(), - "--master", master, - "--num-executors", "1", - "--properties-file", propsFile.getAbsolutePath()) ++ - extraJarArgs ++ - sparkArgs ++ - mainArgs ++ - appArgs - - Utils.executeAndGetOutput(argv, - extraEnvironment = Map("YARN_CONF_DIR" -> hadoopConfDir.getAbsolutePath()) ++ extraEnv) - } - - /** - * This is a workaround for an issue with yarn-cluster mode: the Client class will not provide - * any sort of error when the job process finishes successfully, but the job itself fails. So - * the tests enforce that something is written to a file after everything is ok to indicate - * that the job succeeded. - */ - protected def checkResult(result: File): Unit = { - checkResult(result, "success") - } - - protected def checkResult(result: File, expected: String): Unit = { - val resultString = Files.toString(result, UTF_8) - resultString should be (expected) - } - - protected def mainClassName(klass: Class[_]): String = { - klass.getName().stripSuffix("$") + propsFile.getAbsolutePath() } } diff --git a/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnClusterSuite.scala b/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnClusterSuite.scala index f1601cd16100f..d1cd0c89b5d38 100644 --- a/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnClusterSuite.scala +++ b/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnClusterSuite.scala @@ -19,16 +19,20 @@ package org.apache.spark.deploy.yarn import java.io.File import java.net.URL +import java.util.{HashMap => JHashMap, Properties} import scala.collection.mutable +import scala.concurrent.duration._ +import scala.language.postfixOps import com.google.common.base.Charsets.UTF_8 import com.google.common.io.{ByteStreams, Files} import org.apache.hadoop.yarn.conf.YarnConfiguration import org.scalatest.Matchers +import org.scalatest.concurrent.Eventually._ import org.apache.spark._ -import org.apache.spark.launcher.TestClasspathBuilder +import org.apache.spark.launcher._ import org.apache.spark.scheduler.{SparkListener, SparkListenerApplicationStart, SparkListenerExecutorAdded} import org.apache.spark.scheduler.cluster.ExecutorInfo @@ -82,10 +86,8 @@ class YarnClusterSuite extends BaseYarnClusterSuite { test("run Spark in yarn-cluster mode unsuccessfully") { // Don't provide arguments so the driver will fail. 
- val exception = intercept[SparkException] { - runSpark(false, mainClassName(YarnClusterDriver.getClass)) - fail("Spark application should have failed.") - } + val finalState = runSpark(false, mainClassName(YarnClusterDriver.getClass)) + finalState should be (SparkAppHandle.State.FAILED) } test("run Python application in yarn-client mode") { @@ -104,11 +106,42 @@ class YarnClusterSuite extends BaseYarnClusterSuite { testUseClassPathFirst(false) } + test("monitor app using launcher library") { + val env = new JHashMap[String, String]() + env.put("YARN_CONF_DIR", hadoopConfDir.getAbsolutePath()) + + val propsFile = createConfFile() + val handle = new SparkLauncher(env) + .setSparkHome(sys.props("spark.test.home")) + .setConf("spark.ui.enabled", "false") + .setPropertiesFile(propsFile) + .setMaster("yarn-client") + .setAppResource("spark-internal") + .setMainClass(mainClassName(YarnLauncherTestApp.getClass)) + .startApplication() + + try { + eventually(timeout(30 seconds), interval(100 millis)) { + handle.getState() should be (SparkAppHandle.State.RUNNING) + } + + handle.getAppId() should not be (null) + handle.getAppId() should startWith ("application_") + handle.stop() + + eventually(timeout(30 seconds), interval(100 millis)) { + handle.getState() should be (SparkAppHandle.State.KILLED) + } + } finally { + handle.kill() + } + } + private def testBasicYarnApp(clientMode: Boolean): Unit = { val result = File.createTempFile("result", null, tempDir) - runSpark(clientMode, mainClassName(YarnClusterDriver.getClass), + val finalState = runSpark(clientMode, mainClassName(YarnClusterDriver.getClass), appArgs = Seq(result.getAbsolutePath())) - checkResult(result) + checkResult(finalState, result) } private def testPySpark(clientMode: Boolean): Unit = { @@ -143,11 +176,11 @@ class YarnClusterSuite extends BaseYarnClusterSuite { val pyFiles = Seq(pyModule.getAbsolutePath(), mod2Archive.getPath()).mkString(",") val result = File.createTempFile("result", null, tempDir) - runSpark(clientMode, primaryPyFile.getAbsolutePath(), - sparkArgs = Seq("--py-files", pyFiles), + val finalState = runSpark(clientMode, primaryPyFile.getAbsolutePath(), + sparkArgs = Seq("--py-files" -> pyFiles), appArgs = Seq(result.getAbsolutePath()), extraEnv = extraEnv) - checkResult(result) + checkResult(finalState, result) } private def testUseClassPathFirst(clientMode: Boolean): Unit = { @@ -156,15 +189,15 @@ class YarnClusterSuite extends BaseYarnClusterSuite { val userJar = TestUtils.createJarWithFiles(Map("test.resource" -> "OVERRIDDEN"), tempDir) val driverResult = File.createTempFile("driver", null, tempDir) val executorResult = File.createTempFile("executor", null, tempDir) - runSpark(clientMode, mainClassName(YarnClasspathTest.getClass), + val finalState = runSpark(clientMode, mainClassName(YarnClasspathTest.getClass), appArgs = Seq(driverResult.getAbsolutePath(), executorResult.getAbsolutePath()), extraClassPath = Seq(originalJar.getPath()), extraJars = Seq("local:" + userJar.getPath()), extraConf = Map( "spark.driver.userClassPathFirst" -> "true", "spark.executor.userClassPathFirst" -> "true")) - checkResult(driverResult, "OVERRIDDEN") - checkResult(executorResult, "OVERRIDDEN") + checkResult(finalState, driverResult, "OVERRIDDEN") + checkResult(finalState, executorResult, "OVERRIDDEN") } } @@ -211,8 +244,8 @@ private object YarnClusterDriver extends Logging with Matchers { data should be (Set(1, 2, 3, 4)) result = "success" } finally { - sc.stop() Files.write(result, status, UTF_8) + sc.stop() } // verify log urls are 
present @@ -297,3 +330,18 @@ private object YarnClasspathTest extends Logging { } } + +private object YarnLauncherTestApp { + + def main(args: Array[String]): Unit = { + // Do not stop the application; the test will stop it using the launcher lib. Just run a task + // that will prevent the process from exiting. + val sc = new SparkContext(new SparkConf()) + sc.parallelize(Seq(1)).foreach { i => + this.synchronized { + wait() + } + } + } + +} diff --git a/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnShuffleIntegrationSuite.scala b/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnShuffleIntegrationSuite.scala index a85e5772a0fa4..c17e8695c24fb 100644 --- a/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnShuffleIntegrationSuite.scala +++ b/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnShuffleIntegrationSuite.scala @@ -53,7 +53,7 @@ class YarnShuffleIntegrationSuite extends BaseYarnClusterSuite { logInfo("Shuffle service port = " + shuffleServicePort) val result = File.createTempFile("result", null, tempDir) - runSpark( + val finalState = runSpark( false, mainClassName(YarnExternalShuffleDriver.getClass), appArgs = Seq(result.getAbsolutePath(), registeredExecFile.getAbsolutePath), @@ -62,7 +62,7 @@ class YarnShuffleIntegrationSuite extends BaseYarnClusterSuite { "spark.shuffle.service.port" -> shuffleServicePort.toString ) ) - checkResult(result) + checkResult(finalState, result) assert(YarnTestAccessor.getRegisteredExecutorFile(shuffleService).exists()) } } From 12b7191d2075ae870c73529de450cbb5725872ec Mon Sep 17 00:00:00 2001 From: Rick Hillegas Date: Fri, 9 Oct 2015 13:36:51 -0700 Subject: [PATCH 016/139] [SPARK-10855] [SQL] Add a JDBC dialect for Apache Derby marmbrus rxin This patch adds a JdbcDialect class, which customizes the datatype mappings for Derby backends. The patch also adds unit tests for the new dialect, corresponding to the existing tests for other JDBC dialects. JDBCSuite runs cleanly for me with this patch. So does JDBCWriteSuite, although it produces noise as described here: https://issues.apache.org/jira/browse/SPARK-10890 This patch is my original work, which I license to the ASF. I am a Derby contributor, so my ICLA is on file under SVN id "rhillegas": http://people.apache.org/committer-index.html Touches the following files: --------------------------------- org.apache.spark.sql.jdbc.JdbcDialects Adds a DerbyDialect. --------------------------------- org.apache.spark.sql.jdbc.JDBCSuite Adds unit tests for the new DerbyDialect. Author: Rick Hillegas Closes #8982 from rick-ibm/b_10855. --- .../apache/spark/sql/jdbc/JdbcDialects.scala | 28 +++++++++++++++++++ .../org/apache/spark/sql/jdbc/JDBCSuite.scala | 14 +++++++++- 2 files changed, 41 insertions(+), 1 deletion(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/JdbcDialects.scala b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/JdbcDialects.scala index 0cd356f222984..a2ff4cc1c91f9 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/JdbcDialects.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/JdbcDialects.scala @@ -138,6 +138,7 @@ object JdbcDialects { registerDialect(PostgresDialect) registerDialect(DB2Dialect) registerDialect(MsSqlServerDialect) + registerDialect(DerbyDialect) /** @@ -287,3 +288,30 @@ case object MsSqlServerDialect extends JdbcDialect { case _ => None } } + +/** + * :: DeveloperApi :: + * Default Apache Derby dialect, mapping real on read + * and string/byte/short/boolean/decimal on write. 
+ */ +@DeveloperApi +case object DerbyDialect extends JdbcDialect { + override def canHandle(url: String): Boolean = url.startsWith("jdbc:derby") + override def getCatalystType( + sqlType: Int, typeName: String, size: Int, md: MetadataBuilder): Option[DataType] = { + if (sqlType == Types.REAL) Option(FloatType) else None + } + + override def getJDBCType(dt: DataType): Option[JdbcType] = dt match { + case StringType => Some(JdbcType("CLOB", java.sql.Types.CLOB)) + case ByteType => Some(JdbcType("SMALLINT", java.sql.Types.SMALLINT)) + case ShortType => Some(JdbcType("SMALLINT", java.sql.Types.SMALLINT)) + case BooleanType => Some(JdbcType("BOOLEAN", java.sql.Types.BOOLEAN)) + // 31 is the maximum precision and 5 is the default scale for a Derby DECIMAL + case (t: DecimalType) if (t.precision > 31) => + Some(JdbcType("DECIMAL(31,5)", java.sql.Types.DECIMAL)) + case _ => None + } + +} + diff --git a/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCSuite.scala index bbf705ce95933..d530b1a469ce2 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCSuite.scala @@ -409,18 +409,22 @@ class JDBCSuite extends SparkFunSuite with BeforeAndAfter with SharedSQLContext assert(JdbcDialects.get("jdbc:postgresql://127.0.0.1/db") == PostgresDialect) assert(JdbcDialects.get("jdbc:db2://127.0.0.1/db") == DB2Dialect) assert(JdbcDialects.get("jdbc:sqlserver://127.0.0.1/db") == MsSqlServerDialect) + assert(JdbcDialects.get("jdbc:derby:db") == DerbyDialect) assert(JdbcDialects.get("test.invalid") == NoopDialect) } test("quote column names by jdbc dialect") { val MySQL = JdbcDialects.get("jdbc:mysql://127.0.0.1/db") val Postgres = JdbcDialects.get("jdbc:postgresql://127.0.0.1/db") + val Derby = JdbcDialects.get("jdbc:derby:db") val columns = Seq("abc", "key") val MySQLColumns = columns.map(MySQL.quoteIdentifier(_)) val PostgresColumns = columns.map(Postgres.quoteIdentifier(_)) + val DerbyColumns = columns.map(Derby.quoteIdentifier(_)) assert(MySQLColumns === Seq("`abc`", "`key`")) assert(PostgresColumns === Seq(""""abc"""", """"key"""")) + assert(DerbyColumns === Seq(""""abc"""", """"key"""")) } test("Dialect unregister") { @@ -454,16 +458,23 @@ class JDBCSuite extends SparkFunSuite with BeforeAndAfter with SharedSQLContext test("PostgresDialect type mapping") { val Postgres = JdbcDialects.get("jdbc:postgresql://127.0.0.1/db") - // SPARK-7869: Testing JSON types handling assert(Postgres.getCatalystType(java.sql.Types.OTHER, "json", 1, null) === Some(StringType)) assert(Postgres.getCatalystType(java.sql.Types.OTHER, "jsonb", 1, null) === Some(StringType)) } + test("DerbyDialect jdbc type mapping") { + val derbyDialect = JdbcDialects.get("jdbc:derby:db") + assert(derbyDialect.getJDBCType(StringType).map(_.databaseTypeDefinition).get == "CLOB") + assert(derbyDialect.getJDBCType(ByteType).map(_.databaseTypeDefinition).get == "SMALLINT") + assert(derbyDialect.getJDBCType(BooleanType).map(_.databaseTypeDefinition).get == "BOOLEAN") + } + test("table exists query by jdbc dialect") { val MySQL = JdbcDialects.get("jdbc:mysql://127.0.0.1/db") val Postgres = JdbcDialects.get("jdbc:postgresql://127.0.0.1/db") val db2 = JdbcDialects.get("jdbc:db2://127.0.0.1/db") val h2 = JdbcDialects.get(url) + val derby = JdbcDialects.get("jdbc:derby:db") val table = "weblogs" val defaultQuery = s"SELECT * FROM $table WHERE 1=0" val limitQuery = s"SELECT 1 FROM $table LIMIT 1" @@ -471,5 
+482,6 @@ class JDBCSuite extends SparkFunSuite with BeforeAndAfter with SharedSQLContext assert(Postgres.getTableExistsQuery(table) == limitQuery) assert(db2.getTableExistsQuery(table) == defaultQuery) assert(h2.getTableExistsQuery(table) == defaultQuery) + assert(derby.getTableExistsQuery(table) == defaultQuery) } } From 63c340a710b24869410d56602b712fbfe443e6f0 Mon Sep 17 00:00:00 2001 From: Tom Graves Date: Fri, 9 Oct 2015 14:06:25 -0700 Subject: [PATCH 017/139] [SPARK-10858] YARN: archives/jar/files rename with # doesn't work unl https://issues.apache.org/jira/browse/SPARK-10858 The issue here is that in resolveURI we default to calling new File(path).getAbsoluteFile().toURI(). But if the path passed in already has a # in it then File(path) will think that is supposed to be part of the actual file path and not a fragment so it changes # to %23. Then when we try to parse that later in Client as a URI it doesn't recognize there is a fragment. so to fix we just check if there is a fragment, still create the File like we did before and then add the fragment back on. Author: Tom Graves Closes #9035 from tgravescs/SPARK-10858. --- core/src/main/scala/org/apache/spark/util/Utils.scala | 7 +++++++ core/src/test/scala/org/apache/spark/util/UtilsSuite.scala | 6 +++--- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/util/Utils.scala b/core/src/main/scala/org/apache/spark/util/Utils.scala index 2bab4af2e73ab..e60c1b355a73e 100644 --- a/core/src/main/scala/org/apache/spark/util/Utils.scala +++ b/core/src/main/scala/org/apache/spark/util/Utils.scala @@ -1749,6 +1749,13 @@ private[spark] object Utils extends Logging { if (uri.getScheme() != null) { return uri } + // make sure to handle if the path has a fragment (applies to yarn + // distributed cache) + if (uri.getFragment() != null) { + val absoluteURI = new File(uri.getPath()).getAbsoluteFile().toURI() + return new URI(absoluteURI.getScheme(), absoluteURI.getHost(), absoluteURI.getPath(), + uri.getFragment()) + } } catch { case e: URISyntaxException => } diff --git a/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala b/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala index 1fb81ad565b41..68b0da76bc134 100644 --- a/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala +++ b/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala @@ -384,7 +384,7 @@ class UtilsSuite extends SparkFunSuite with ResetSystemProperties with Logging { assertResolves("hdfs:/root/spark.jar", "hdfs:/root/spark.jar") assertResolves("hdfs:///root/spark.jar#app.jar", "hdfs:/root/spark.jar#app.jar") assertResolves("spark.jar", s"file:$cwd/spark.jar") - assertResolves("spark.jar#app.jar", s"file:$cwd/spark.jar%23app.jar") + assertResolves("spark.jar#app.jar", s"file:$cwd/spark.jar#app.jar") assertResolves("path to/file.txt", s"file:$cwd/path%20to/file.txt") if (Utils.isWindows) { assertResolves("C:\\path\\to\\file.txt", "file:/C:/path/to/file.txt") @@ -414,10 +414,10 @@ class UtilsSuite extends SparkFunSuite with ResetSystemProperties with Logging { assertResolves("file:/jar1,file:/jar2", "file:/jar1,file:/jar2") assertResolves("hdfs:/jar1,file:/jar2,jar3", s"hdfs:/jar1,file:/jar2,file:$cwd/jar3") assertResolves("hdfs:/jar1,file:/jar2,jar3,jar4#jar5,path to/jar6", - s"hdfs:/jar1,file:/jar2,file:$cwd/jar3,file:$cwd/jar4%23jar5,file:$cwd/path%20to/jar6") + s"hdfs:/jar1,file:/jar2,file:$cwd/jar3,file:$cwd/jar4#jar5,file:$cwd/path%20to/jar6") if (Utils.isWindows) { 
assertResolves("""hdfs:/jar1,file:/jar2,jar3,C:\pi.py#py.pi,C:\path to\jar4""", - s"hdfs:/jar1,file:/jar2,file:$cwd/jar3,file:/C:/pi.py%23py.pi,file:/C:/path%20to/jar4") + s"hdfs:/jar1,file:/jar2,file:$cwd/jar3,file:/C:/pi.py#py.pi,file:/C:/path%20to/jar4") } } From c1b4ce43264fa8b9945df3c599a51d4d2a675705 Mon Sep 17 00:00:00 2001 From: Vladimir Vladimirov Date: Fri, 9 Oct 2015 14:16:13 -0700 Subject: [PATCH 018/139] [SPARK-10535] Sync up API for matrix factorization model between Scala and PySpark Support for recommendUsersForProducts and recommendProductsForUsers in matrix factorization model for PySpark Author: Vladimir Vladimirov Closes #8700 from smartkiwi/SPARK-10535_. --- .../MatrixFactorizationModelWrapper.scala | 8 +++++ python/pyspark/mllib/recommendation.py | 32 ++++++++++++++++--- 2 files changed, 36 insertions(+), 4 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/api/python/MatrixFactorizationModelWrapper.scala b/mllib/src/main/scala/org/apache/spark/mllib/api/python/MatrixFactorizationModelWrapper.scala index 534edac56bc5a..eeb7cba882ce2 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/api/python/MatrixFactorizationModelWrapper.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/api/python/MatrixFactorizationModelWrapper.scala @@ -42,4 +42,12 @@ private[python] class MatrixFactorizationModelWrapper(model: MatrixFactorization case (product, feature) => (product, Vectors.dense(feature)) }.asInstanceOf[RDD[(Any, Any)]]) } + + def wrappedRecommendProductsForUsers(num: Int): RDD[Array[Any]] = { + SerDe.fromTuple2RDD(recommendProductsForUsers(num).asInstanceOf[RDD[(Any, Any)]]) + } + + def wrappedRecommendUsersForProducts(num: Int): RDD[Array[Any]] = { + SerDe.fromTuple2RDD(recommendUsersForProducts(num).asInstanceOf[RDD[(Any, Any)]]) + } } diff --git a/python/pyspark/mllib/recommendation.py b/python/pyspark/mllib/recommendation.py index 95047b5b7b4b7..b9442b0d16c0f 100644 --- a/python/pyspark/mllib/recommendation.py +++ b/python/pyspark/mllib/recommendation.py @@ -76,16 +76,28 @@ class MatrixFactorizationModel(JavaModelWrapper, JavaSaveable, JavaLoader): >>> first_user = model.userFeatures().take(1)[0] >>> latents = first_user[1] - >>> len(latents) == 4 - True + >>> len(latents) + 4 >>> model.productFeatures().collect() [(1, array('d', [...])), (2, array('d', [...]))] >>> first_product = model.productFeatures().take(1)[0] >>> latents = first_product[1] - >>> len(latents) == 4 - True + >>> len(latents) + 4 + + >>> products_for_users = model.recommendProductsForUsers(1).collect() + >>> len(products_for_users) + 2 + >>> products_for_users[0] + (1, (Rating(user=1, product=2, rating=...),)) + + >>> users_for_products = model.recommendUsersForProducts(1).collect() + >>> len(users_for_products) + 2 + >>> users_for_products[0] + (1, (Rating(user=2, product=1, rating=...),)) >>> model = ALS.train(ratings, 1, nonnegative=True, seed=10) >>> model.predict(2, 2) @@ -166,6 +178,18 @@ def recommendProducts(self, user, num): """ return list(self.call("recommendProducts", user, num)) + def recommendProductsForUsers(self, num): + """ + Recommends top "num" products for all users. The number returned may be less than this. + """ + return self.call("wrappedRecommendProductsForUsers", num) + + def recommendUsersForProducts(self, num): + """ + Recommends top "num" users for all products. The number returned may be less than this. 
+ """ + return self.call("wrappedRecommendUsersForProducts", num) + @property @since("1.4.0") def rank(self): From 864de3bf4041c829e95d278b9569e91448bab0cc Mon Sep 17 00:00:00 2001 From: Sun Rui Date: Fri, 9 Oct 2015 23:05:38 -0700 Subject: [PATCH 019/139] [SPARK-10079] [SPARKR] Make 'column' and 'col' functions be S4 functions. 1. Add a "col" function into DataFrame. 2. Move the current "col" function in Column.R to functions.R, convert it to S4 function. 3. Add a s4 "column" function in functions.R. 4. Convert the "column" function in Column.R to S4 function. This is for private use. Author: Sun Rui Closes #8864 from sun-rui/SPARK-10079. --- R/pkg/NAMESPACE | 1 + R/pkg/R/column.R | 12 +++++------- R/pkg/R/functions.R | 22 ++++++++++++++++++++++ R/pkg/R/generics.R | 4 ++++ R/pkg/inst/tests/test_sparkSQL.R | 4 ++-- 5 files changed, 34 insertions(+), 9 deletions(-) diff --git a/R/pkg/NAMESPACE b/R/pkg/NAMESPACE index 255be2e76ff49..95d949ee3e5a4 100644 --- a/R/pkg/NAMESPACE +++ b/R/pkg/NAMESPACE @@ -107,6 +107,7 @@ exportMethods("%in%", "cbrt", "ceil", "ceiling", + "column", "concat", "concat_ws", "contains", diff --git a/R/pkg/R/column.R b/R/pkg/R/column.R index 42e9d12179db7..20de3907b7dd9 100644 --- a/R/pkg/R/column.R +++ b/R/pkg/R/column.R @@ -36,13 +36,11 @@ setMethod("initialize", "Column", function(.Object, jc) { .Object }) -column <- function(jc) { - new("Column", jc) -} - -col <- function(x) { - column(callJStatic("org.apache.spark.sql.functions", "col", x)) -} +setMethod("column", + signature(x = "jobj"), + function(x) { + new("Column", x) + }) #' @rdname show #' @name show diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R index 94687edb05442..a220ad8b9f58b 100644 --- a/R/pkg/R/functions.R +++ b/R/pkg/R/functions.R @@ -233,6 +233,28 @@ setMethod("ceil", column(jc) }) +#' Though scala functions has "col" function, we don't expose it in SparkR +#' because we don't want to conflict with the "col" function in the R base +#' package and we also have "column" function exported which is an alias of "col". +col <- function(x) { + column(callJStatic("org.apache.spark.sql.functions", "col", x)) +} + +#' column +#' +#' Returns a Column based on the given column name. +#' +#' @rdname col +#' @name column +#' @family normal_funcs +#' @export +#' @examples \dontrun{column(df)} +setMethod("column", + signature(x = "character"), + function(x) { + col(x) + }) + #' cos #' #' Computes the cosine of the given value. diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R index c4474131804bb..8fad17026c06f 100644 --- a/R/pkg/R/generics.R +++ b/R/pkg/R/generics.R @@ -686,6 +686,10 @@ setGeneric("cbrt", function(x) { standardGeneric("cbrt") }) #' @export setGeneric("ceil", function(x) { standardGeneric("ceil") }) +#' @rdname col +#' @export +setGeneric("column", function(x) { standardGeneric("column") }) + #' @rdname concat #' @export setGeneric("concat", function(x, ...) 
{ standardGeneric("concat") }) diff --git a/R/pkg/inst/tests/test_sparkSQL.R b/R/pkg/inst/tests/test_sparkSQL.R index 4804ecf177341..3a04edbb4c116 100644 --- a/R/pkg/inst/tests/test_sparkSQL.R +++ b/R/pkg/inst/tests/test_sparkSQL.R @@ -787,7 +787,7 @@ test_that("test HiveContext", { }) test_that("column operators", { - c <- SparkR:::col("a") + c <- column("a") c2 <- (- c + 1 - 2) * 3 / 4.0 c3 <- (c + c2 - c2) * c2 %% c2 c4 <- (c > c2) & (c2 <= c3) | (c == c2) & (c2 != c3) @@ -795,7 +795,7 @@ test_that("column operators", { }) test_that("column functions", { - c <- SparkR:::col("a") + c <- column("a") c1 <- abs(c) + acos(c) + approxCountDistinct(c) + ascii(c) + asin(c) + atan(c) c2 <- avg(c) + base64(c) + bin(c) + bitwiseNOT(c) + cbrt(c) + ceil(c) + cos(c) c3 <- cosh(c) + count(c) + crc32(c) + exp(c) From a16396df76cc27099011bfb96b28cbdd7f964ca8 Mon Sep 17 00:00:00 2001 From: Jacker Hu Date: Sat, 10 Oct 2015 11:36:18 +0100 Subject: [PATCH 020/139] [SPARK-10772] [STREAMING] [SCALA] NullPointerException when transform function in DStream returns NULL Currently, the ```TransformedDStream``` will using ```Some(transformFunc(parentRDDs, validTime))``` as compute return value, when the ```transformFunc``` somehow returns null as return value, the followed operator will have NullPointerExeception. This fix uses the ```Option()``` instead of ```Some()``` to deal with the possible null value. When ```transformFunc``` returns ```null```, the option will transform null to ```None```, the downstream can handle ```None``` correctly. NOTE (2015-09-25): The latest fix will check the return value of transform function, if it is ```NULL```, a spark exception will be thrown out Author: Jacker Hu Author: jhu-chang Closes #8881 from jhu-chang/Fix_Transform. --- .../streaming/dstream/TransformedDStream.scala | 12 ++++++++++-- .../spark/streaming/BasicOperationsSuite.scala | 13 +++++++++++++ 2 files changed, 23 insertions(+), 2 deletions(-) diff --git a/streaming/src/main/scala/org/apache/spark/streaming/dstream/TransformedDStream.scala b/streaming/src/main/scala/org/apache/spark/streaming/dstream/TransformedDStream.scala index 5d46ca0715ffd..ab01f47d5cf99 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/dstream/TransformedDStream.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/dstream/TransformedDStream.scala @@ -17,9 +17,11 @@ package org.apache.spark.streaming.dstream +import scala.reflect.ClassTag + +import org.apache.spark.SparkException import org.apache.spark.rdd.{PairRDDFunctions, RDD} import org.apache.spark.streaming.{Duration, Time} -import scala.reflect.ClassTag private[streaming] class TransformedDStream[U: ClassTag] ( @@ -38,6 +40,12 @@ class TransformedDStream[U: ClassTag] ( override def compute(validTime: Time): Option[RDD[U]] = { val parentRDDs = parents.map(_.getOrCompute(validTime).orNull).toSeq - Some(transformFunc(parentRDDs, validTime)) + val transformedRDD = transformFunc(parentRDDs, validTime) + if (transformedRDD == null) { + throw new SparkException("Transform function must not return null. 
" + + "Return SparkContext.emptyRDD() instead to represent no element " + + "as the result of transformation.") + } + Some(transformedRDD) } } diff --git a/streaming/src/test/scala/org/apache/spark/streaming/BasicOperationsSuite.scala b/streaming/src/test/scala/org/apache/spark/streaming/BasicOperationsSuite.scala index 255376807c957..9988f410f0bc1 100644 --- a/streaming/src/test/scala/org/apache/spark/streaming/BasicOperationsSuite.scala +++ b/streaming/src/test/scala/org/apache/spark/streaming/BasicOperationsSuite.scala @@ -211,6 +211,19 @@ class BasicOperationsSuite extends TestSuiteBase { ) } + test("transform with NULL") { + val input = Seq(1 to 4) + intercept[SparkException] { + testOperation( + input, + (r: DStream[Int]) => r.transform(rdd => null.asInstanceOf[RDD[Int]]), + Seq(Seq()), + 1, + false + ) + } + } + test("transformWith") { val inputData1 = Seq( Seq("a", "b"), Seq("a", ""), Seq(""), Seq() ) val inputData2 = Seq( Seq("a", "b"), Seq("b", ""), Seq(), Seq("") ) From 595012ea8b9c6afcc2fc024d5a5e198df765bd75 Mon Sep 17 00:00:00 2001 From: Josh Rosen Date: Sun, 11 Oct 2015 18:11:08 -0700 Subject: [PATCH 021/139] [SPARK-11053] Remove use of KVIterator in SortBasedAggregationIterator SortBasedAggregationIterator uses a KVIterator interface in order to process input rows as key-value pairs, but this use of KVIterator is unnecessary, slightly complicates the code, and might hurt performance. This patch refactors this code to remove the use of this extra layer of iterator wrapping and simplifies other parts of the code in the process. Author: Josh Rosen Closes #9066 from JoshRosen/sort-iterator-cleanup. --- .../aggregate/AggregationIterator.scala | 83 ----------------- .../aggregate/SortBasedAggregate.scala | 20 +++-- .../SortBasedAggregationIterator.scala | 89 +++++-------------- 3 files changed, 33 insertions(+), 159 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/AggregationIterator.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/AggregationIterator.scala index 5f7341e88c7c9..8e0fbd109b413 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/AggregationIterator.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/AggregationIterator.scala @@ -21,7 +21,6 @@ import org.apache.spark.Logging import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.expressions.aggregate._ -import org.apache.spark.unsafe.KVIterator import scala.collection.mutable.ArrayBuffer @@ -412,85 +411,3 @@ abstract class AggregationIterator( */ protected def newBuffer: MutableRow } - -object AggregationIterator { - def kvIterator( - groupingExpressions: Seq[NamedExpression], - newProjection: (Seq[Expression], Seq[Attribute]) => Projection, - inputAttributes: Seq[Attribute], - inputIter: Iterator[InternalRow]): KVIterator[InternalRow, InternalRow] = { - new KVIterator[InternalRow, InternalRow] { - private[this] val groupingKeyGenerator = newProjection(groupingExpressions, inputAttributes) - - private[this] var groupingKey: InternalRow = _ - - private[this] var value: InternalRow = _ - - override def next(): Boolean = { - if (inputIter.hasNext) { - // Read the next input row. - val inputRow = inputIter.next() - // Get groupingKey based on groupingExpressions. - groupingKey = groupingKeyGenerator(inputRow) - // The value is the inputRow. 
- value = inputRow - true - } else { - false - } - } - - override def getKey(): InternalRow = { - groupingKey - } - - override def getValue(): InternalRow = { - value - } - - override def close(): Unit = { - // Do nothing - } - } - } - - def unsafeKVIterator( - groupingExpressions: Seq[NamedExpression], - inputAttributes: Seq[Attribute], - inputIter: Iterator[InternalRow]): KVIterator[UnsafeRow, InternalRow] = { - new KVIterator[UnsafeRow, InternalRow] { - private[this] val groupingKeyGenerator = - UnsafeProjection.create(groupingExpressions, inputAttributes) - - private[this] var groupingKey: UnsafeRow = _ - - private[this] var value: InternalRow = _ - - override def next(): Boolean = { - if (inputIter.hasNext) { - // Read the next input row. - val inputRow = inputIter.next() - // Get groupingKey based on groupingExpressions. - groupingKey = groupingKeyGenerator.apply(inputRow) - // The value is the inputRow. - value = inputRow - true - } else { - false - } - } - - override def getKey(): UnsafeRow = { - groupingKey - } - - override def getValue(): InternalRow = { - value - } - - override def close(): Unit = { - // Do nothing - } - } - } -} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/SortBasedAggregate.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/SortBasedAggregate.scala index f4c14a9b3556f..4d37106e007f5 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/SortBasedAggregate.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/SortBasedAggregate.scala @@ -23,9 +23,8 @@ import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.expressions.aggregate._ import org.apache.spark.sql.catalyst.plans.physical.{UnspecifiedDistribution, ClusteredDistribution, AllTuples, Distribution} -import org.apache.spark.sql.execution.{UnsafeFixedWidthAggregationMap, SparkPlan, UnaryNode} +import org.apache.spark.sql.execution.{SparkPlan, UnaryNode} import org.apache.spark.sql.execution.metric.SQLMetrics -import org.apache.spark.sql.types.StructType case class SortBasedAggregate( requiredChildDistributionExpressions: Option[Seq[Expression]], @@ -79,18 +78,23 @@ case class SortBasedAggregate( // so return an empty iterator. 
Iterator[InternalRow]() } else { - val outputIter = SortBasedAggregationIterator.createFromInputIterator( - groupingExpressions, + val groupingKeyProjection = if (UnsafeProjection.canSupport(groupingExpressions)) { + UnsafeProjection.create(groupingExpressions, child.output) + } else { + newMutableProjection(groupingExpressions, child.output)() + } + val outputIter = new SortBasedAggregationIterator( + groupingKeyProjection, + groupingExpressions.map(_.toAttribute), + child.output, + iter, nonCompleteAggregateExpressions, nonCompleteAggregateAttributes, completeAggregateExpressions, completeAggregateAttributes, initialInputBufferOffset, resultExpressions, - newMutableProjection _, - newProjection _, - child.output, - iter, + newMutableProjection, outputsUnsafeRows, numInputRows, numOutputRows) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/SortBasedAggregationIterator.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/SortBasedAggregationIterator.scala index a9e5d175bf895..64c673064f576 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/SortBasedAggregationIterator.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/SortBasedAggregationIterator.scala @@ -21,16 +21,16 @@ import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.expressions.aggregate.{AggregateExpression2, AggregateFunction2} import org.apache.spark.sql.execution.metric.LongSQLMetric -import org.apache.spark.unsafe.KVIterator /** * An iterator used to evaluate [[AggregateFunction2]]. It assumes the input rows have been * sorted by values of [[groupingKeyAttributes]]. */ class SortBasedAggregationIterator( + groupingKeyProjection: InternalRow => InternalRow, groupingKeyAttributes: Seq[Attribute], valueAttributes: Seq[Attribute], - inputKVIterator: KVIterator[InternalRow, InternalRow], + inputIterator: Iterator[InternalRow], nonCompleteAggregateExpressions: Seq[AggregateExpression2], nonCompleteAggregateAttributes: Seq[Attribute], completeAggregateExpressions: Seq[AggregateExpression2], @@ -90,6 +90,22 @@ class SortBasedAggregationIterator( // The aggregation buffer used by the sort-based aggregation. private[this] val sortBasedAggregationBuffer: MutableRow = newBuffer + protected def initialize(): Unit = { + if (inputIterator.hasNext) { + initializeBuffer(sortBasedAggregationBuffer) + val inputRow = inputIterator.next() + nextGroupingKey = groupingKeyProjection(inputRow).copy() + firstRowInNextGroup = inputRow.copy() + numInputRows += 1 + sortedInputHasNewGroup = true + } else { + // This inputIter is empty. + sortedInputHasNewGroup = false + } + } + + initialize() + /** Processes rows in the current group. It will stop when it find a new group. */ protected def processCurrentSortedGroup(): Unit = { currentGroupingKey = nextGroupingKey @@ -101,18 +117,15 @@ class SortBasedAggregationIterator( // The search will stop when we see the next group or there is no // input row left in the iter. - var hasNext = inputKVIterator.next() - while (!findNextPartition && hasNext) { + while (!findNextPartition && inputIterator.hasNext) { // Get the grouping key. - val groupingKey = inputKVIterator.getKey - val currentRow = inputKVIterator.getValue + val currentRow = inputIterator.next() + val groupingKey = groupingKeyProjection(currentRow) numInputRows += 1 // Check if the current row belongs the current input row. 
if (currentGroupingKey == groupingKey) { processRow(sortBasedAggregationBuffer, currentRow) - - hasNext = inputKVIterator.next() } else { // We find a new group. findNextPartition = true @@ -149,68 +162,8 @@ class SortBasedAggregationIterator( } } - protected def initialize(): Unit = { - if (inputKVIterator.next()) { - initializeBuffer(sortBasedAggregationBuffer) - - nextGroupingKey = inputKVIterator.getKey().copy() - firstRowInNextGroup = inputKVIterator.getValue().copy() - numInputRows += 1 - sortedInputHasNewGroup = true - } else { - // This inputIter is empty. - sortedInputHasNewGroup = false - } - } - - initialize() - def outputForEmptyGroupingKeyWithoutInput(): InternalRow = { initializeBuffer(sortBasedAggregationBuffer) generateOutput(new GenericInternalRow(0), sortBasedAggregationBuffer) } } - -object SortBasedAggregationIterator { - // scalastyle:off - def createFromInputIterator( - groupingExprs: Seq[NamedExpression], - nonCompleteAggregateExpressions: Seq[AggregateExpression2], - nonCompleteAggregateAttributes: Seq[Attribute], - completeAggregateExpressions: Seq[AggregateExpression2], - completeAggregateAttributes: Seq[Attribute], - initialInputBufferOffset: Int, - resultExpressions: Seq[NamedExpression], - newMutableProjection: (Seq[Expression], Seq[Attribute]) => (() => MutableProjection), - newProjection: (Seq[Expression], Seq[Attribute]) => Projection, - inputAttributes: Seq[Attribute], - inputIter: Iterator[InternalRow], - outputsUnsafeRows: Boolean, - numInputRows: LongSQLMetric, - numOutputRows: LongSQLMetric): SortBasedAggregationIterator = { - val kvIterator = if (UnsafeProjection.canSupport(groupingExprs)) { - AggregationIterator.unsafeKVIterator( - groupingExprs, - inputAttributes, - inputIter).asInstanceOf[KVIterator[InternalRow, InternalRow]] - } else { - AggregationIterator.kvIterator(groupingExprs, newProjection, inputAttributes, inputIter) - } - - new SortBasedAggregationIterator( - groupingExprs.map(_.toAttribute), - inputAttributes, - kvIterator, - nonCompleteAggregateExpressions, - nonCompleteAggregateAttributes, - completeAggregateExpressions, - completeAggregateAttributes, - initialInputBufferOffset, - resultExpressions, - newMutableProjection, - outputsUnsafeRows, - numInputRows, - numOutputRows) - } - // scalastyle:on -} From fcb37a04177edc2376e39dd0b910f0268f7c72ec Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Mon, 12 Oct 2015 09:16:14 -0700 Subject: [PATCH 022/139] [SPARK-10960] [SQL] SQL with windowing function should be able to refer column in inner select JIRA: https://issues.apache.org/jira/browse/SPARK-10960 When accessing a column in inner select from a select with window function, `AnalysisException` will be thrown. For example, an query like this: select area, rank() over (partition by area order by tmp.month) + tmp.tmp1 as c1 from (select month, area, product, 1 as tmp1 from windowData) tmp Currently, the rule `ExtractWindowExpressions` in `Analyzer` only extracts regular expressions from `WindowFunction`, `WindowSpecDefinition` and `AggregateExpression`. We need to also extract other attributes as the one in `Alias` as shown in the above query. Author: Liang-Chi Hsieh Closes #9011 from viirya/fix-window-inner-column. 
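For illustration, a minimal sketch of the query pattern this patch enables (hedged: it assumes a HiveContext named sqlContext, since the test below runs against Hive support, and a WindowData(month, area, product) case class mirroring that test; the names here are illustrative and not part of the patch itself):

    // Build a small table matching the test data added below.
    case class WindowData(month: Int, area: String, product: Int)
    val rows = Seq(
      WindowData(1, "a", 5), WindowData(2, "a", 6),
      WindowData(3, "b", 7), WindowData(4, "b", 8),
      WindowData(5, "c", 9), WindowData(6, "c", 10))
    sqlContext.createDataFrame(rows).registerTempTable("windowData")

    // Before this change, referencing tmp.tmp1 (a plain attribute from the inner
    // select) next to the window expression raised AnalysisException; with the
    // added "case attr: Attribute" extraction in ExtractWindowExpressions it resolves.
    sqlContext.sql(
      """select area, rank() over (partition by area order by tmp.month) + tmp.tmp1 as c1
        |from (select month, area, product, 1 as tmp1 from windowData) tmp
      """.stripMargin).show()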
--- .../sql/catalyst/analysis/Analyzer.scala | 4 +++ .../sql/hive/execution/SQLQuerySuite.scala | 27 +++++++++++++++++++ 2 files changed, 31 insertions(+) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala index bf72d47ce1ea6..f5597a08d3595 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala @@ -831,6 +831,10 @@ class Analyzer( val withName = Alias(agg, s"_w${extractedExprBuffer.length}")() extractedExprBuffer += withName withName.toAttribute + + // Extracts other attributes + case attr: Attribute => extractExpr(attr) + }.asInstanceOf[NamedExpression] } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala index ccc15eaa63f42..51b63f3688783 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala @@ -838,6 +838,33 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton { ).map(i => Row(i._1, i._2, i._3))) } + test("window function: refer column in inner select block") { + val data = Seq( + WindowData(1, "a", 5), + WindowData(2, "a", 6), + WindowData(3, "b", 7), + WindowData(4, "b", 8), + WindowData(5, "c", 9), + WindowData(6, "c", 10) + ) + sparkContext.parallelize(data).toDF().registerTempTable("windowData") + + checkAnswer( + sql( + """ + |select area, rank() over (partition by area order by tmp.month) + tmp.tmp1 as c1 + |from (select month, area, product, 1 as tmp1 from windowData) tmp + """.stripMargin), + Seq( + ("a", 2), + ("a", 3), + ("b", 2), + ("b", 3), + ("c", 2), + ("c", 3) + ).map(i => Row(i._1, i._2))) + } + test("window function: partition and order expressions") { val data = Seq( WindowData(1, "a", 5), From 64b1d00e1a7c1dc52c08a5e97baf6e7117f1a94f Mon Sep 17 00:00:00 2001 From: Cheng Lian Date: Mon, 12 Oct 2015 10:17:19 -0700 Subject: [PATCH 023/139] [SPARK-11007] [SQL] Adds dictionary aware Parquet decimal converters For Parquet decimal columns that are encoded using plain-dictionary encoding, we can make the upper level converter aware of the dictionary, so that we can pre-instantiate all the decimals to avoid duplicated instantiation. Note that plain-dictionary encoding isn't available for `FIXED_LEN_BYTE_ARRAY` for Parquet writer version `PARQUET_1_0`. So currently only decimals written as `INT32` and `INT64` can benefit from this optimization. Author: Cheng Lian Closes #9040 from liancheng/spark-11007.decimal-converter-dict-support. 
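To show the optimization in isolation, a minimal sketch of the dictionary-expansion idea (hedged: the helper name is illustrative and not part of this patch; the actual converters live in CatalystRowConverter.scala below, and the Parquet Dictionary and Spark Decimal calls used here are the same ones the patch relies on):

    import org.apache.parquet.column.Dictionary
    import org.apache.spark.sql.types.Decimal

    // Materialize each distinct INT64-backed decimal once, keyed by dictionary id,
    // so every row that references the same id reuses the same Decimal instead of
    // constructing a new one per value read.
    def expandInt64DecimalDictionary(
        dictionary: Dictionary, precision: Int, scale: Int): Array[Decimal] = {
      Array.tabulate(dictionary.getMaxId + 1) { id =>
        Decimal(dictionary.decodeToLong(id), precision, scale)
      }
    }

As noted above, PARQUET_1_0 writers do not emit plain-dictionary encoding for FIXED_LEN_BYTE_ARRAY, so in practice this pre-expansion mainly benefits decimals stored as INT32 or INT64.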
--- .../parquet/CatalystRowConverter.scala | 83 +++++++++++++++--- .../src/test/resources/dec-in-i32.parquet | Bin 0 -> 420 bytes .../src/test/resources/dec-in-i64.parquet | Bin 0 -> 437 bytes .../datasources/parquet/ParquetIOSuite.scala | 19 ++++ .../ParquetProtobufCompatibilitySuite.scala | 22 ++--- .../datasources/parquet/ParquetTest.scala | 5 ++ 6 files changed, 103 insertions(+), 26 deletions(-) create mode 100755 sql/core/src/test/resources/dec-in-i32.parquet create mode 100755 sql/core/src/test/resources/dec-in-i64.parquet diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/CatalystRowConverter.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/CatalystRowConverter.scala index 247d35363b862..49007e45ecf87 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/CatalystRowConverter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/CatalystRowConverter.scala @@ -26,7 +26,7 @@ import scala.collection.mutable.ArrayBuffer import org.apache.parquet.column.Dictionary import org.apache.parquet.io.api.{Binary, Converter, GroupConverter, PrimitiveConverter} import org.apache.parquet.schema.OriginalType.{INT_32, LIST, UTF8} -import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.DOUBLE +import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.{DOUBLE, INT32, INT64, BINARY, FIXED_LEN_BYTE_ARRAY} import org.apache.parquet.schema.{GroupType, MessageType, PrimitiveType, Type} import org.apache.spark.Logging @@ -222,8 +222,25 @@ private[parquet] class CatalystRowConverter( updater.setShort(value.asInstanceOf[ShortType#InternalType]) } + // For INT32 backed decimals + case t: DecimalType if parquetType.asPrimitiveType().getPrimitiveTypeName == INT32 => + new CatalystIntDictionaryAwareDecimalConverter(t.precision, t.scale, updater) + + // For INT64 backed decimals + case t: DecimalType if parquetType.asPrimitiveType().getPrimitiveTypeName == INT64 => + new CatalystLongDictionaryAwareDecimalConverter(t.precision, t.scale, updater) + + // For BINARY and FIXED_LEN_BYTE_ARRAY backed decimals + case t: DecimalType + if parquetType.asPrimitiveType().getPrimitiveTypeName == FIXED_LEN_BYTE_ARRAY || + parquetType.asPrimitiveType().getPrimitiveTypeName == BINARY => + new CatalystBinaryDictionaryAwareDecimalConverter(t.precision, t.scale, updater) + case t: DecimalType => - new CatalystDecimalConverter(t, updater) + throw new RuntimeException( + s"Unable to create Parquet converter for decimal type ${t.json} whose Parquet type is " + + s"$parquetType. Parquet DECIMAL type can only be backed by INT32, INT64, " + + "FIXED_LEN_BYTE_ARRAY, or BINARY.") case StringType => new CatalystStringConverter(updater) @@ -274,9 +291,10 @@ private[parquet] class CatalystRowConverter( override def set(value: Any): Unit = updater.set(value.asInstanceOf[InternalRow].copy()) }) - case _ => + case t => throw new RuntimeException( - s"Unable to create Parquet converter for data type ${catalystType.json}") + s"Unable to create Parquet converter for data type ${t.json} " + + s"whose Parquet type is $parquetType") } } @@ -314,11 +332,18 @@ private[parquet] class CatalystRowConverter( /** * Parquet converter for fixed-precision decimals. 
*/ - private final class CatalystDecimalConverter( - decimalType: DecimalType, - updater: ParentContainerUpdater) + private abstract class CatalystDecimalConverter( + precision: Int, scale: Int, updater: ParentContainerUpdater) extends CatalystPrimitiveConverter(updater) { + protected var expandedDictionary: Array[Decimal] = _ + + override def hasDictionarySupport: Boolean = true + + override def addValueFromDictionary(dictionaryId: Int): Unit = { + updater.set(expandedDictionary(dictionaryId)) + } + // Converts decimals stored as INT32 override def addInt(value: Int): Unit = { addLong(value: Long) @@ -326,18 +351,19 @@ private[parquet] class CatalystRowConverter( // Converts decimals stored as INT64 override def addLong(value: Long): Unit = { - updater.set(Decimal(value, decimalType.precision, decimalType.scale)) + updater.set(decimalFromLong(value)) } // Converts decimals stored as either FIXED_LENGTH_BYTE_ARRAY or BINARY override def addBinary(value: Binary): Unit = { - updater.set(toDecimal(value)) + updater.set(decimalFromBinary(value)) } - private def toDecimal(value: Binary): Decimal = { - val precision = decimalType.precision - val scale = decimalType.scale + protected def decimalFromLong(value: Long): Decimal = { + Decimal(value, precision, scale) + } + protected def decimalFromBinary(value: Binary): Decimal = { if (precision <= CatalystSchemaConverter.MAX_PRECISION_FOR_INT64) { // Constructs a `Decimal` with an unscaled `Long` value if possible. val unscaled = binaryToUnscaledLong(value) @@ -371,6 +397,39 @@ private[parquet] class CatalystRowConverter( } } + private class CatalystIntDictionaryAwareDecimalConverter( + precision: Int, scale: Int, updater: ParentContainerUpdater) + extends CatalystDecimalConverter(precision, scale, updater) { + + override def setDictionary(dictionary: Dictionary): Unit = { + this.expandedDictionary = Array.tabulate(dictionary.getMaxId + 1) { id => + decimalFromLong(dictionary.decodeToInt(id).toLong) + } + } + } + + private class CatalystLongDictionaryAwareDecimalConverter( + precision: Int, scale: Int, updater: ParentContainerUpdater) + extends CatalystDecimalConverter(precision, scale, updater) { + + override def setDictionary(dictionary: Dictionary): Unit = { + this.expandedDictionary = Array.tabulate(dictionary.getMaxId + 1) { id => + decimalFromLong(dictionary.decodeToLong(id)) + } + } + } + + private class CatalystBinaryDictionaryAwareDecimalConverter( + precision: Int, scale: Int, updater: ParentContainerUpdater) + extends CatalystDecimalConverter(precision, scale, updater) { + + override def setDictionary(dictionary: Dictionary): Unit = { + this.expandedDictionary = Array.tabulate(dictionary.getMaxId + 1) { id => + decimalFromBinary(dictionary.decodeToBinary(id)) + } + } + } + /** * Parquet converter for arrays. Spark SQL arrays are represented as Parquet lists. 
Standard * Parquet lists are represented as a 3-level group annotated by `LIST`: diff --git a/sql/core/src/test/resources/dec-in-i32.parquet b/sql/core/src/test/resources/dec-in-i32.parquet new file mode 100755 index 0000000000000000000000000000000000000000..bb5d4af8dd36817bfb2cc16746417f0f2cbec759 GIT binary patch literal 420 zcmWG=3^EjD5e*Pc^AQyhWno~D@8)2DfaHXP1P{hX!VYT=pEzK^*s-6XkVTmJu$)3z zLRvxwV-mx>L&=vkfNDhP{zKtf8Q zr~#MeY(^CZr+?eF2>?}yGD+%q@Dvv$7G=j5CugMQCWub(penut~xdh_Z+&h@D{+YhzO5u)%NwNJfD{Qbs~EzbIWVu^ zi5}QKz2d?gJ)p&frKu%)Mfv4=xv3?IDTyVC63Nv{C6xuKN>)n6B}JvlB}zI=l}^8Q8rNy83~RSW{3$AFryg6KmtfcCnY2VB%~yY z8gOaOW>jHt`nPSH0LUmNNgWTK;)2AY?D*p3jMUsjQ6>ga7F8w*_DnOA_>|OSRW6_{ zA`D^*k}{GqY8*16ERv=y9Bh(s1)?ls3S#S+#HKN+aoFH=3P^*c1FB&H;mBub=IE0t6hq$*h{6_*s1CYLDbD5Yhl z=A;xWSw&YX hadoopConfiguration.set(entry.getKey, entry.getValue)) } } + + test("read dictionary encoded decimals written as INT32") { + checkAnswer( + // Decimal column in this file is encoded using plain dictionary + readResourceParquetFile("dec-in-i32.parquet"), + sqlContext.range(1 << 4).select('id % 10 cast DecimalType(5, 2) as 'i32_dec)) + } + + test("read dictionary encoded decimals written as INT64") { + checkAnswer( + // Decimal column in this file is encoded using plain dictionary + readResourceParquetFile("dec-in-i64.parquet"), + sqlContext.range(1 << 4).select('id % 10 cast DecimalType(10, 2) as 'i64_dec)) + } + + // TODO Adds test case for reading dictionary encoded decimals written as `FIXED_LEN_BYTE_ARRAY` + // The Parquet writer version Spark 1.6 and prior versions use is `PARQUET_1_0`, which doesn't + // provide dictionary encoding support for `FIXED_LEN_BYTE_ARRAY`. Should add a test here once + // we upgrade to `PARQUET_2_0`. } class JobCommitFailureParquetOutputCommitter(outputPath: Path, context: TaskAttemptContext) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetProtobufCompatibilitySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetProtobufCompatibilitySuite.scala index b290429c2a021..98333e58cada8 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetProtobufCompatibilitySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetProtobufCompatibilitySuite.scala @@ -17,23 +17,17 @@ package org.apache.spark.sql.execution.datasources.parquet -import org.apache.spark.sql.{DataFrame, Row} +import org.apache.spark.sql.Row import org.apache.spark.sql.test.SharedSQLContext class ParquetProtobufCompatibilitySuite extends ParquetCompatibilityTest with SharedSQLContext { - - private def readParquetProtobufFile(name: String): DataFrame = { - val url = Thread.currentThread().getContextClassLoader.getResource(name) - sqlContext.read.parquet(url.toString) - } - test("unannotated array of primitive type") { - checkAnswer(readParquetProtobufFile("old-repeated-int.parquet"), Row(Seq(1, 2, 3))) + checkAnswer(readResourceParquetFile("old-repeated-int.parquet"), Row(Seq(1, 2, 3))) } test("unannotated array of struct") { checkAnswer( - readParquetProtobufFile("old-repeated-message.parquet"), + readResourceParquetFile("old-repeated-message.parquet"), Row( Seq( Row("First inner", null, null), @@ -41,14 +35,14 @@ class ParquetProtobufCompatibilitySuite extends ParquetCompatibilityTest with Sh Row(null, null, "Third inner")))) checkAnswer( - readParquetProtobufFile("proto-repeated-struct.parquet"), + readResourceParquetFile("proto-repeated-struct.parquet"), Row( 
Seq( Row("0 - 1", "0 - 2", "0 - 3"), Row("1 - 1", "1 - 2", "1 - 3")))) checkAnswer( - readParquetProtobufFile("proto-struct-with-array-many.parquet"), + readResourceParquetFile("proto-struct-with-array-many.parquet"), Seq( Row( Seq( @@ -66,13 +60,13 @@ class ParquetProtobufCompatibilitySuite extends ParquetCompatibilityTest with Sh test("struct with unannotated array") { checkAnswer( - readParquetProtobufFile("proto-struct-with-array.parquet"), + readResourceParquetFile("proto-struct-with-array.parquet"), Row(10, 9, Seq.empty, null, Row(9), Seq(Row(9), Row(10)))) } test("unannotated array of struct with unannotated array") { checkAnswer( - readParquetProtobufFile("nested-array-struct.parquet"), + readResourceParquetFile("nested-array-struct.parquet"), Seq( Row(2, Seq(Row(1, Seq(Row(3))))), Row(5, Seq(Row(4, Seq(Row(6))))), @@ -81,7 +75,7 @@ class ParquetProtobufCompatibilitySuite extends ParquetCompatibilityTest with Sh test("unannotated array of string") { checkAnswer( - readParquetProtobufFile("proto-repeated-string.parquet"), + readResourceParquetFile("proto-repeated-string.parquet"), Seq( Row(Seq("hello", "world")), Row(Seq("good", "bye")), diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetTest.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetTest.scala index 9840ad919e510..8ffb01fc5b584 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetTest.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetTest.scala @@ -139,4 +139,9 @@ private[sql] trait ParquetTest extends SQLTestUtils { withSQLConf(SQLConf.PARQUET_WRITE_LEGACY_FORMAT.key -> "true") { f } } } + + protected def readResourceParquetFile(name: String): DataFrame = { + val url = Thread.currentThread().getContextClassLoader.getResource(name) + sqlContext.read.parquet(url.toString) + } } From 149472a01d12828c64b0a852982d48c123984182 Mon Sep 17 00:00:00 2001 From: Marcelo Vanzin Date: Mon, 12 Oct 2015 10:21:57 -0700 Subject: [PATCH 024/139] [SPARK-11023] [YARN] Avoid creating URIs from local paths directly. The issue is that local paths on Windows, when provided with drive letters or backslashes, are not valid URIs. Instead of trying to figure out whether paths are URIs or not, use Utils.resolveURI() which does that for us. Author: Marcelo Vanzin Closes #9049 from vanzin/SPARK-11023 and squashes the following commits: 77021f2 [Marcelo Vanzin] [SPARK-11023] [yarn] Avoid creating URIs from local paths directly. 
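The sketch below (not from the patch) shows the `java.net.URI` behavior that motivated the change; per the description above, `Utils.resolveURI` is a Spark-internal helper meant to cope with both of these Windows path shapes.

```
import java.net.URI
import scala.util.Try

// Backslashes are illegal in URIs, so parsing a raw Windows path simply fails.
Try(new URI("""C:\work\lib\foo.jar""")).isFailure      // true (URISyntaxException)

// With forward slashes the parse "succeeds", but the drive letter is mistaken for a scheme.
new URI("C:/work/lib/foo.jar").getScheme               // "C", not "file" or null

// For the backslash case, Utils.resolveURI falls back to interpreting the string as a
// local file (via java.io.File#toURI), so the scheme checks in Client see a file: URI.
```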
--- .../scala/org/apache/spark/deploy/yarn/Client.scala | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala b/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala index cec81b940644c..1fbd18aa466d4 100644 --- a/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala +++ b/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala @@ -358,7 +358,8 @@ private[spark] class Client( destName: Option[String] = None, targetDir: Option[String] = None, appMasterOnly: Boolean = false): (Boolean, String) = { - val localURI = new URI(path.trim()) + val trimmedPath = path.trim() + val localURI = Utils.resolveURI(trimmedPath) if (localURI.getScheme != LOCAL_SCHEME) { if (addDistributedUri(localURI)) { val localPath = getQualifiedLocalPath(localURI, hadoopConf) @@ -374,7 +375,7 @@ private[spark] class Client( (false, null) } } else { - (true, path.trim()) + (true, trimmedPath) } } @@ -595,10 +596,10 @@ private[spark] class Client( LOCALIZED_PYTHON_DIR) } (pySparkArchives ++ pyArchives).foreach { path => - val uri = new URI(path) + val uri = Utils.resolveURI(path) if (uri.getScheme != LOCAL_SCHEME) { pythonPath += buildPath(YarnSparkHadoopUtil.expandEnvironment(Environment.PWD), - new Path(path).getName()) + new Path(uri).getName()) } else { pythonPath += uri.getPath() } @@ -1229,7 +1230,7 @@ object Client extends Logging { private def getMainJarUri(mainJar: Option[String]): Option[URI] = { mainJar.flatMap { path => - val uri = new URI(path) + val uri = Utils.resolveURI(path) if (uri.getScheme == LOCAL_SCHEME) Some(uri) else None }.orElse(Some(new URI(APP_JAR))) } From 2e572c4135c3f5ad3061c1f58cdb8a70bed0a9d3 Mon Sep 17 00:00:00 2001 From: Ashwin Shankar Date: Mon, 12 Oct 2015 11:06:21 -0700 Subject: [PATCH 025/139] [SPARK-8170] [PYTHON] Add signal handler to trap Ctrl-C in pyspark and cancel all running jobs This patch adds a signal handler to trap Ctrl-C and cancels running job. Author: Ashwin Shankar Closes #9033 from ashwinshankar77/master. --- python/pyspark/context.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/python/pyspark/context.py b/python/pyspark/context.py index a0a1ccbeefb09..4969d85f52b23 100644 --- a/python/pyspark/context.py +++ b/python/pyspark/context.py @@ -19,6 +19,7 @@ import os import shutil +import signal import sys from threading import Lock from tempfile import NamedTemporaryFile @@ -217,6 +218,12 @@ def _do_init(self, master, appName, sparkHome, pyFiles, environment, batchSize, else: self.profiler_collector = None + # create a signal handler which would be invoked on receiving SIGINT + def signal_handler(signal, frame): + self.cancelAllJobs() + + signal.signal(signal.SIGINT, signal_handler) + def _initialize_context(self, jconf): """ Initialize SparkContext in function to allow subclass specific initialization From 8a354bef55ce9cc0fa77fa1c3a9d62c16438ca1b Mon Sep 17 00:00:00 2001 From: Yin Huai Date: Mon, 12 Oct 2015 13:50:34 -0700 Subject: [PATCH 026/139] [SPARK-11042] [SQL] Add a mechanism to ban creating multiple root SQLContexts/HiveContexts in a JVM https://issues.apache.org/jira/browse/SPARK-11042 Author: Yin Huai Closes #9058 from yhuai/SPARK-11042. 
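A condensed usage sketch of the new flag, distilled from the `MultiSQLContextsSuite` added below; the master and app name are placeholders, while the conf key, the exception type, and the `newSession()` escape hatch come from the patch.

```
import org.apache.spark.{SparkConf, SparkContext, SparkException}
import org.apache.spark.sql.SQLContext

// The flag must be set in the SparkConf before the root context is created.
val conf = new SparkConf()
  .setMaster("local[*]")
  .setAppName("single-root-sqlcontext")
  .set("spark.sql.allowMultipleContexts", "false")
val sc = new SparkContext(conf)

val root = new SQLContext(sc)   // first root SQLContext: allowed
root.newSession()               // additional sessions are still fine

try {
  new SQLContext(sc)            // second root SQLContext: rejected
} catch {
  case e: SparkException =>
    // "Only one SQLContext/HiveContext may be running in this JVM. ..."
    println(e.getMessage)
}
```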
--- .../scala/org/apache/spark/sql/SQLConf.scala | 10 ++ .../org/apache/spark/sql/SQLContext.scala | 42 +++++++- .../spark/sql/MultiSQLContextsSuite.scala | 99 +++++++++++++++++++ .../apache/spark/sql/hive/HiveContext.scala | 12 ++- 4 files changed, 156 insertions(+), 7 deletions(-) create mode 100644 sql/core/src/test/scala/org/apache/spark/sql/MultiSQLContextsSuite.scala diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala b/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala index 47397c4be3cb6..f62df9bdebcc0 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala @@ -186,6 +186,16 @@ private[spark] object SQLConf { import SQLConfEntry._ + val ALLOW_MULTIPLE_CONTEXTS = booleanConf("spark.sql.allowMultipleContexts", + defaultValue = Some(true), + doc = "When set to true, creating multiple SQLContexts/HiveContexts is allowed." + + "When set to false, only one SQLContext/HiveContext is allowed to be created " + + "through the constructor (new SQLContexts/HiveContexts created through newSession " + + "method is allowed). Please note that this conf needs to be set in Spark Conf. Once" + + "a SQLContext/HiveContext has been created, changing the value of this conf will not" + + "have effect.", + isPublic = true) + val COMPRESS_CACHED = booleanConf("spark.sql.inMemoryColumnarStorage.compressed", defaultValue = Some(true), doc = "When set to true Spark SQL will automatically select a compression codec for each " + diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala index 2bdfd82af0adb..1bd291389241a 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala @@ -26,7 +26,7 @@ import scala.collection.immutable import scala.reflect.runtime.universe.TypeTag import scala.util.control.NonFatal -import org.apache.spark.SparkContext +import org.apache.spark.{SparkException, SparkContext} import org.apache.spark.annotation.{DeveloperApi, Experimental} import org.apache.spark.api.java.{JavaRDD, JavaSparkContext} import org.apache.spark.rdd.RDD @@ -64,14 +64,37 @@ import org.apache.spark.util.Utils */ class SQLContext private[sql]( @transient val sparkContext: SparkContext, - @transient protected[sql] val cacheManager: CacheManager) + @transient protected[sql] val cacheManager: CacheManager, + val isRootContext: Boolean) extends org.apache.spark.Logging with Serializable { self => - def this(sparkContext: SparkContext) = this(sparkContext, new CacheManager) + def this(sparkContext: SparkContext) = this(sparkContext, new CacheManager, true) def this(sparkContext: JavaSparkContext) = this(sparkContext.sc) + // If spark.sql.allowMultipleContexts is true, we will throw an exception if a user + // wants to create a new root SQLContext (a SLQContext that is not created by newSession). + private val allowMultipleContexts = + sparkContext.conf.getBoolean( + SQLConf.ALLOW_MULTIPLE_CONTEXTS.key, + SQLConf.ALLOW_MULTIPLE_CONTEXTS.defaultValue.get) + + // Assert no root SQLContext is running when allowMultipleContexts is false. + { + if (!allowMultipleContexts && isRootContext) { + SQLContext.getInstantiatedContextOption() match { + case Some(rootSQLContext) => + val errMsg = "Only one SQLContext/HiveContext may be running in this JVM. 
" + + s"It is recommended to use SQLContext.getOrCreate to get the instantiated " + + s"SQLContext/HiveContext. To ignore this error, " + + s"set ${SQLConf.ALLOW_MULTIPLE_CONTEXTS.key} = true in SparkConf." + throw new SparkException(errMsg) + case None => // OK + } + } + } + /** * Returns a SQLContext as new session, with separated SQL configurations, temporary tables, * registered functions, but sharing the same SparkContext and CacheManager. @@ -79,7 +102,10 @@ class SQLContext private[sql]( * @since 1.6.0 */ def newSession(): SQLContext = { - new SQLContext(sparkContext, cacheManager) + new SQLContext( + sparkContext = sparkContext, + cacheManager = cacheManager, + isRootContext = false) } /** @@ -1239,6 +1265,10 @@ object SQLContext { instantiatedContext.compareAndSet(null, sqlContext) } + private[sql] def getInstantiatedContextOption(): Option[SQLContext] = { + Option(instantiatedContext.get()) + } + /** * Changes the SQLContext that will be returned in this thread and its children when * SQLContext.getOrCreate() is called. This can be used to ensure that a given thread receives @@ -1260,6 +1290,10 @@ object SQLContext { activeContext.remove() } + private[sql] def getActiveContextOption(): Option[SQLContext] = { + Option(activeContext.get()) + } + /** * Converts an iterator of Java Beans to InternalRow using the provided * bean info & schema. This is not related to the singleton, but is a static diff --git a/sql/core/src/test/scala/org/apache/spark/sql/MultiSQLContextsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/MultiSQLContextsSuite.scala new file mode 100644 index 0000000000000..0e8fcb6a858b1 --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/MultiSQLContextsSuite.scala @@ -0,0 +1,99 @@ +/* +* Licensed to the Apache Software Foundation (ASF) under one or more +* contributor license agreements. See the NOTICE file distributed with +* this work for additional information regarding copyright ownership. +* The ASF licenses this file to You under the Apache License, Version 2.0 +* (the "License"); you may not use this file except in compliance with +* the License. You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*/ + +package org.apache.spark.sql + +import org.apache.spark._ +import org.scalatest.BeforeAndAfterAll + +class MultiSQLContextsSuite extends SparkFunSuite with BeforeAndAfterAll { + + private var originalActiveSQLContext: Option[SQLContext] = _ + private var originalInstantiatedSQLContext: Option[SQLContext] = _ + private var sparkConf: SparkConf = _ + + override protected def beforeAll(): Unit = { + originalActiveSQLContext = SQLContext.getActiveContextOption() + originalInstantiatedSQLContext = SQLContext.getInstantiatedContextOption() + + SQLContext.clearActive() + originalInstantiatedSQLContext.foreach(ctx => SQLContext.clearInstantiatedContext(ctx)) + sparkConf = + new SparkConf(false) + .setMaster("local[*]") + .setAppName("test") + .set("spark.ui.enabled", "false") + .set("spark.driver.allowMultipleContexts", "true") + } + + override protected def afterAll(): Unit = { + // Set these states back. 
+ originalActiveSQLContext.foreach(ctx => SQLContext.setActive(ctx)) + originalInstantiatedSQLContext.foreach(ctx => SQLContext.setInstantiatedContext(ctx)) + } + + def testNewSession(rootSQLContext: SQLContext): Unit = { + // Make sure we can successfully create new Session. + rootSQLContext.newSession() + + // Reset the state. It is always safe to clear the active context. + SQLContext.clearActive() + } + + def testCreatingNewSQLContext(allowsMultipleContexts: Boolean): Unit = { + val conf = + sparkConf + .clone + .set(SQLConf.ALLOW_MULTIPLE_CONTEXTS.key, allowsMultipleContexts.toString) + val sparkContext = new SparkContext(conf) + + try { + if (allowsMultipleContexts) { + new SQLContext(sparkContext) + SQLContext.clearActive() + } else { + // If allowsMultipleContexts is false, make sure we can get the error. + val message = intercept[SparkException] { + new SQLContext(sparkContext) + }.getMessage + assert(message.contains("Only one SQLContext/HiveContext may be running")) + } + } finally { + sparkContext.stop() + } + } + + test("test the flag to disallow creating multiple root SQLContext") { + Seq(false, true).foreach { allowMultipleSQLContexts => + val conf = + sparkConf + .clone + .set(SQLConf.ALLOW_MULTIPLE_CONTEXTS.key, allowMultipleSQLContexts.toString) + val sc = new SparkContext(conf) + try { + val rootSQLContext = new SQLContext(sc) + testNewSession(rootSQLContext) + testNewSession(rootSQLContext) + testCreatingNewSQLContext(allowMultipleSQLContexts) + + SQLContext.clearInstantiatedContext(rootSQLContext) + } finally { + sc.stop() + } + } + } +} diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala index dad1e2347c387..ddeadd3eb737d 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala @@ -89,10 +89,11 @@ class HiveContext private[hive]( sc: SparkContext, cacheManager: CacheManager, @transient execHive: ClientWrapper, - @transient metaHive: ClientInterface) extends SQLContext(sc, cacheManager) with Logging { + @transient metaHive: ClientInterface, + isRootContext: Boolean) extends SQLContext(sc, cacheManager, isRootContext) with Logging { self => - def this(sc: SparkContext) = this(sc, new CacheManager, null, null) + def this(sc: SparkContext) = this(sc, new CacheManager, null, null, true) def this(sc: JavaSparkContext) = this(sc.sc) import org.apache.spark.sql.hive.HiveContext._ @@ -105,7 +106,12 @@ class HiveContext private[hive]( * and Hive client (both of execution and metadata) with existing HiveContext. */ override def newSession(): HiveContext = { - new HiveContext(sc, cacheManager, executionHive.newSession(), metadataHive.newSession()) + new HiveContext( + sc = sc, + cacheManager = cacheManager, + execHive = executionHive.newSession(), + metaHive = metadataHive.newSession(), + isRootContext = false) } /** From 091c2c3ecd69803d78c2b15a1487046701059d38 Mon Sep 17 00:00:00 2001 From: Kay Ousterhout Date: Mon, 12 Oct 2015 14:23:29 -0700 Subject: [PATCH 027/139] [SPARK-11056] Improve documentation of SBT build. This commit improves the documentation around building Spark to (1) recommend using SBT interactive mode to avoid the overhead of launching SBT and (2) refer to the wiki page that documents using SPARK_PREPEND_CLASSES to avoid creating the assembly jar for each compile. cc srowen Author: Kay Ousterhout Closes #9068 from kayousterhout/SPARK-11056. 
--- docs/building-spark.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/docs/building-spark.md b/docs/building-spark.md index 4d929ee10a33f..743643cbcc62f 100644 --- a/docs/building-spark.md +++ b/docs/building-spark.md @@ -216,6 +216,11 @@ can be set to control the SBT build. For example: build/sbt -Pyarn -Phadoop-2.3 assembly +To avoid the overhead of launching sbt each time you need to re-compile, you can launch sbt +in interactive mode by running `build/sbt`, and then run all build commands at the command +prompt. For more recommendations on reducing build time, refer to the +[wiki page](https://cwiki.apache.org/confluence/display/SPARK/Useful+Developer+Tools#UsefulDeveloperTools-ReducingBuildTimes). + # Testing with SBT Some of the tests require Spark to be packaged first, so always run `build/sbt assembly` the first time. The following is an example of a correct (build, test) sequence: From f97e9323b526b3d0b0fee0ca03f4276f37bb5750 Mon Sep 17 00:00:00 2001 From: jerryshao Date: Mon, 12 Oct 2015 18:17:28 -0700 Subject: [PATCH 028/139] [SPARK-10739] [YARN] Add application attempt window for Spark on Yarn Add application attempt window for Spark on Yarn to ignore old out of window failures, this is useful for long running applications to recover from failures. Author: jerryshao Closes #8857 from jerryshao/SPARK-10739 and squashes the following commits: 36eabdc [jerryshao] change the doc 7f9b77d [jerryshao] Style change 1c9afd0 [jerryshao] Address the comments caca695 [jerryshao] Add application attempt window for Spark on Yarn --- docs/running-on-yarn.md | 9 +++++++++ .../org/apache/spark/deploy/yarn/Client.scala | 14 ++++++++++++++ 2 files changed, 23 insertions(+) diff --git a/docs/running-on-yarn.md b/docs/running-on-yarn.md index 6d77db6a3271e..677c0000440ac 100644 --- a/docs/running-on-yarn.md +++ b/docs/running-on-yarn.md @@ -305,6 +305,15 @@ If you need a reference to the proper location to put log files in the YARN so t It should be no larger than the global number of max attempts in the YARN configuration. + + spark.yarn.am.attemptFailuresValidityInterval + (none) + + Defines the validity interval for AM failure tracking. + If the AM has been running for at least the defined interval, the AM failure count will be reset. + This feature is not enabled if not configured, and only supported in Hadoop 2.6+. + + spark.yarn.submit.waitAppCompletion true diff --git a/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala b/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala index 1fbd18aa466d4..d25d830fd4349 100644 --- a/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala +++ b/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala @@ -208,6 +208,20 @@ private[spark] class Client( case None => logDebug("spark.yarn.maxAppAttempts is not set. 
" + "Cluster's default value will be used.") } + + if (sparkConf.contains("spark.yarn.am.attemptFailuresValidityInterval")) { + try { + val interval = sparkConf.getTimeAsMs("spark.yarn.am.attemptFailuresValidityInterval") + val method = appContext.getClass().getMethod( + "setAttemptFailuresValidityInterval", classOf[Long]) + method.invoke(appContext, interval: java.lang.Long) + } catch { + case e: NoSuchMethodException => + logWarning("Ignoring spark.yarn.am.attemptFailuresValidityInterval because the version " + + "of YARN does not support it") + } + } + val capability = Records.newRecord(classOf[Resource]) capability.setMemory(args.amMemory + amMemoryOverhead) capability.setVirtualCores(args.amCores) From c4da5345a0ef643a7518756caaa18ff3f3ea9acc Mon Sep 17 00:00:00 2001 From: Davies Liu Date: Mon, 12 Oct 2015 21:12:59 -0700 Subject: [PATCH 029/139] [SPARK-10990] [SPARK-11018] [SQL] improve unrolling of complex types This PR improve the unrolling and read of complex types in columnar cache: 1) Using UnsafeProjection to do serialization of complex types, so they will not be serialized three times (two for actualSize) 2) Copy the bytes from UnsafeRow/UnsafeArrayData to ByteBuffer directly, avoiding the immediate byte[] 3) Using the underlying array in ByteBuffer to create UTF8String/UnsafeRow/UnsafeArrayData without copy. Combine these optimizations, we can reduce the unrolling time from 25s to 21s (20% less), reduce the scanning time from 3.5s to 2.5s (28% less). ``` df = sqlContext.read.parquet(path) t = time.time() df.cache() df.count() print 'unrolling', time.time() - t for i in range(10): t = time.time() print df.select("*")._jdf.queryExecution().toRdd().count() print time.time() - t ``` The schema is ``` root |-- a: struct (nullable = true) | |-- b: long (nullable = true) | |-- c: string (nullable = true) |-- d: array (nullable = true) | |-- element: long (containsNull = true) |-- e: map (nullable = true) | |-- key: long | |-- value: string (valueContainsNull = true) ``` Now the columnar cache depends on that UnsafeProjection support all the data types (including UDT), this PR also fix that. Author: Davies Liu Closes #9016 from davies/complex2. 
--- .../catalyst/expressions/UnsafeArrayData.java | 12 ++ .../sql/catalyst/expressions/UnsafeRow.java | 12 ++ .../expressions/codegen/CodeGenerator.scala | 5 + .../codegen/GenerateSafeProjection.scala | 1 + .../codegen/GenerateUnsafeProjection.scala | 29 ++- .../spark/sql/columnar/ColumnAccessor.scala | 9 +- .../spark/sql/columnar/ColumnType.scala | 187 +++++++++--------- .../columnar/InMemoryColumnarTableScan.scala | 6 +- .../spark/sql/columnar/ColumnTypeSuite.scala | 37 ++-- .../NullableColumnAccessorSuite.scala | 7 +- .../columnar/NullableColumnBuilderSuite.scala | 13 +- .../apache/spark/unsafe/types/UTF8String.java | 10 + 12 files changed, 188 insertions(+), 140 deletions(-) diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeArrayData.java b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeArrayData.java index fdd9125613a26..796f8abec9a1d 100644 --- a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeArrayData.java +++ b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeArrayData.java @@ -19,6 +19,7 @@ import java.math.BigDecimal; import java.math.BigInteger; +import java.nio.ByteBuffer; import org.apache.spark.sql.types.*; import org.apache.spark.unsafe.Platform; @@ -145,6 +146,8 @@ public Object get(int ordinal, DataType dataType) { return getArray(ordinal); } else if (dataType instanceof MapType) { return getMap(ordinal); + } else if (dataType instanceof UserDefinedType) { + return get(ordinal, ((UserDefinedType)dataType).sqlType()); } else { throw new UnsupportedOperationException("Unsupported data type " + dataType.simpleString()); } @@ -306,6 +309,15 @@ public void writeToMemory(Object target, long targetOffset) { Platform.copyMemory(baseObject, baseOffset, target, targetOffset, sizeInBytes); } + public void writeTo(ByteBuffer buffer) { + assert(buffer.hasArray()); + byte[] target = buffer.array(); + int offset = buffer.arrayOffset(); + int pos = buffer.position(); + writeToMemory(target, Platform.BYTE_ARRAY_OFFSET + offset + pos); + buffer.position(pos + sizeInBytes); + } + @Override public UnsafeArrayData copy() { UnsafeArrayData arrayCopy = new UnsafeArrayData(); diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRow.java b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRow.java index 5af7ed5d6eb6d..36859fbab9744 100644 --- a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRow.java +++ b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRow.java @@ -20,6 +20,7 @@ import java.io.*; import java.math.BigDecimal; import java.math.BigInteger; +import java.nio.ByteBuffer; import java.util.Arrays; import java.util.Collections; import java.util.HashSet; @@ -326,6 +327,8 @@ public Object get(int ordinal, DataType dataType) { return getArray(ordinal); } else if (dataType instanceof MapType) { return getMap(ordinal); + } else if (dataType instanceof UserDefinedType) { + return get(ordinal, ((UserDefinedType)dataType).sqlType()); } else { throw new UnsupportedOperationException("Unsupported data type " + dataType.simpleString()); } @@ -602,6 +605,15 @@ public void writeToMemory(Object target, long targetOffset) { Platform.copyMemory(baseObject, baseOffset, target, targetOffset, sizeInBytes); } + public void writeTo(ByteBuffer buffer) { + assert (buffer.hasArray()); + byte[] target = buffer.array(); + int offset = buffer.arrayOffset(); + int pos = 
buffer.position(); + writeToMemory(target, Platform.BYTE_ARRAY_OFFSET + offset + pos); + buffer.position(pos + sizeInBytes); + } + @Override public void writeExternal(ObjectOutput out) throws IOException { byte[] bytes = getBytes(); diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala index a0fe5bd77e3aa..7544d27e3dc15 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala @@ -129,6 +129,7 @@ class CodeGenContext { case _: ArrayType => s"$input.getArray($ordinal)" case _: MapType => s"$input.getMap($ordinal)" case NullType => "null" + case udt: UserDefinedType[_] => getValue(input, udt.sqlType, ordinal) case _ => s"($jt)$input.get($ordinal, null)" } } @@ -143,6 +144,7 @@ class CodeGenContext { case t: DecimalType => s"$row.setDecimal($ordinal, $value, ${t.precision})" // The UTF8String may came from UnsafeRow, otherwise clone is cheap (re-use the bytes) case StringType => s"$row.update($ordinal, $value.clone())" + case udt: UserDefinedType[_] => setColumn(row, udt.sqlType, ordinal, value) case _ => s"$row.update($ordinal, $value)" } } @@ -177,6 +179,7 @@ class CodeGenContext { case _: MapType => "MapData" case dt: OpenHashSetUDT if dt.elementType == IntegerType => classOf[IntegerHashSet].getName case dt: OpenHashSetUDT if dt.elementType == LongType => classOf[LongHashSet].getName + case udt: UserDefinedType[_] => javaType(udt.sqlType) case ObjectType(cls) if cls.isArray => s"${javaType(ObjectType(cls.getComponentType))}[]" case ObjectType(cls) => cls.getName case _ => "Object" @@ -222,6 +225,7 @@ class CodeGenContext { case FloatType => s"(java.lang.Float.isNaN($c1) && java.lang.Float.isNaN($c2)) || $c1 == $c2" case DoubleType => s"(java.lang.Double.isNaN($c1) && java.lang.Double.isNaN($c2)) || $c1 == $c2" case dt: DataType if isPrimitiveType(dt) => s"$c1 == $c2" + case udt: UserDefinedType[_] => genEqual(udt.sqlType, c1, c2) case other => s"$c1.equals($c2)" } @@ -255,6 +259,7 @@ class CodeGenContext { addNewFunction(compareFunc, funcCode) s"this.$compareFunc($c1, $c2)" case other if other.isInstanceOf[AtomicType] => s"$c1.compare($c2)" + case udt: UserDefinedType[_] => genComp(udt.sqlType, c1, c2) case _ => throw new IllegalArgumentException("cannot generate compare code for un-comparable type") } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateSafeProjection.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateSafeProjection.scala index 9873630937d31..ee50587ed097e 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateSafeProjection.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateSafeProjection.scala @@ -124,6 +124,7 @@ object GenerateSafeProjection extends CodeGenerator[Seq[Expression], Projection] case MapType(keyType, valueType, _) => createCodeForMap(ctx, input, keyType, valueType) // UTF8String act as a pointer if it's inside UnsafeRow, so copy it to make it safe. 
case StringType => GeneratedExpressionCode("", "false", s"$input.clone()") + case udt: UserDefinedType[_] => convertToSafe(ctx, input, udt.sqlType) case _ => GeneratedExpressionCode("", "false", input) } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateUnsafeProjection.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateUnsafeProjection.scala index 3e0e81733fb1f..1b957a508d10e 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateUnsafeProjection.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateUnsafeProjection.scala @@ -39,6 +39,8 @@ object GenerateUnsafeProjection extends CodeGenerator[Seq[Expression], UnsafePro case t: StructType => t.toSeq.forall(field => canSupport(field.dataType)) case t: ArrayType if canSupport(t.elementType) => true case MapType(kt, vt, _) if canSupport(kt) && canSupport(vt) => true + case dt: OpenHashSetUDT => false // it's not a standard UDT + case udt: UserDefinedType[_] => canSupport(udt.sqlType) case _ => false } @@ -77,7 +79,11 @@ object GenerateUnsafeProjection extends CodeGenerator[Seq[Expression], UnsafePro ctx.addMutableState(rowWriterClass, rowWriter, s"this.$rowWriter = new $rowWriterClass();") val writeFields = inputs.zip(inputTypes).zipWithIndex.map { - case ((input, dt), index) => + case ((input, dataType), index) => + val dt = dataType match { + case udt: UserDefinedType[_] => udt.sqlType + case other => other + } val tmpCursor = ctx.freshName("tmpCursor") val setNull = dt match { @@ -167,15 +173,20 @@ object GenerateUnsafeProjection extends CodeGenerator[Seq[Expression], UnsafePro val index = ctx.freshName("index") val element = ctx.freshName("element") - val jt = ctx.javaType(elementType) + val et = elementType match { + case udt: UserDefinedType[_] => udt.sqlType + case other => other + } + + val jt = ctx.javaType(et) - val fixedElementSize = elementType match { + val fixedElementSize = et match { case t: DecimalType if t.precision <= Decimal.MAX_LONG_DIGITS => 8 - case _ if ctx.isPrimitiveType(jt) => elementType.defaultSize + case _ if ctx.isPrimitiveType(jt) => et.defaultSize case _ => 0 } - val writeElement = elementType match { + val writeElement = et match { case t: StructType => s""" $arrayWriter.setOffset($index); @@ -194,13 +205,13 @@ object GenerateUnsafeProjection extends CodeGenerator[Seq[Expression], UnsafePro ${writeMapToBuffer(ctx, element, kt, vt, bufferHolder)} """ - case _ if ctx.isPrimitiveType(elementType) => + case _ if ctx.isPrimitiveType(et) => // Should we do word align? 
- val dataSize = elementType.defaultSize + val dataSize = et.defaultSize s""" $arrayWriter.setOffset($index); - ${writePrimitiveType(ctx, element, elementType, + ${writePrimitiveType(ctx, element, et, s"$bufferHolder.buffer", s"$bufferHolder.cursor")} $bufferHolder.cursor += $dataSize; """ @@ -237,7 +248,7 @@ object GenerateUnsafeProjection extends CodeGenerator[Seq[Expression], UnsafePro if ($input.isNullAt($index)) { $arrayWriter.setNullAt($index); } else { - final $jt $element = ${ctx.getValue(input, elementType, index)}; + final $jt $element = ${ctx.getValue(input, et, index)}; $writeElement } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/columnar/ColumnAccessor.scala b/sql/core/src/main/scala/org/apache/spark/sql/columnar/ColumnAccessor.scala index 62478667eb4fb..42ec4d3433f16 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/columnar/ColumnAccessor.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/columnar/ColumnAccessor.scala @@ -19,8 +19,7 @@ package org.apache.spark.sql.columnar import java.nio.{ByteBuffer, ByteOrder} -import org.apache.spark.sql.catalyst.InternalRow -import org.apache.spark.sql.catalyst.expressions.MutableRow +import org.apache.spark.sql.catalyst.expressions.{MutableRow, UnsafeArrayData, UnsafeMapData, UnsafeRow} import org.apache.spark.sql.columnar.compression.CompressibleColumnAccessor import org.apache.spark.sql.types._ @@ -109,15 +108,15 @@ private[sql] class DecimalColumnAccessor(buffer: ByteBuffer, dataType: DecimalTy with NullableColumnAccessor private[sql] class StructColumnAccessor(buffer: ByteBuffer, dataType: StructType) - extends BasicColumnAccessor[InternalRow](buffer, STRUCT(dataType)) + extends BasicColumnAccessor[UnsafeRow](buffer, STRUCT(dataType)) with NullableColumnAccessor private[sql] class ArrayColumnAccessor(buffer: ByteBuffer, dataType: ArrayType) - extends BasicColumnAccessor[ArrayData](buffer, ARRAY(dataType)) + extends BasicColumnAccessor[UnsafeArrayData](buffer, ARRAY(dataType)) with NullableColumnAccessor private[sql] class MapColumnAccessor(buffer: ByteBuffer, dataType: MapType) - extends BasicColumnAccessor[MapData](buffer, MAP(dataType)) + extends BasicColumnAccessor[UnsafeMapData](buffer, MAP(dataType)) with NullableColumnAccessor private[sql] object ColumnAccessor { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/columnar/ColumnType.scala b/sql/core/src/main/scala/org/apache/spark/sql/columnar/ColumnType.scala index 3563eacb3a3e9..2bc2c96b61634 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/columnar/ColumnType.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/columnar/ColumnType.scala @@ -18,7 +18,7 @@ package org.apache.spark.sql.columnar import java.math.{BigDecimal, BigInteger} -import java.nio.{ByteOrder, ByteBuffer} +import java.nio.ByteBuffer import scala.reflect.runtime.universe.TypeTag @@ -92,7 +92,7 @@ private[sql] sealed abstract class ColumnType[JvmType] { * boxing/unboxing costs whenever possible. 
*/ def copyField(from: InternalRow, fromOrdinal: Int, to: MutableRow, toOrdinal: Int): Unit = { - to.update(toOrdinal, from.get(fromOrdinal, dataType)) + setField(to, toOrdinal, getField(from, fromOrdinal)) } /** @@ -147,6 +147,7 @@ private[sql] object INT extends NativeColumnType(IntegerType, 4) { override def getField(row: InternalRow, ordinal: Int): Int = row.getInt(ordinal) + override def copyField(from: InternalRow, fromOrdinal: Int, to: MutableRow, toOrdinal: Int) { to.setInt(toOrdinal, from.getInt(fromOrdinal)) } @@ -324,15 +325,18 @@ private[sql] object STRING extends NativeColumnType(StringType, 8) { } override def append(v: UTF8String, buffer: ByteBuffer): Unit = { - val stringBytes = v.getBytes - buffer.putInt(stringBytes.length).put(stringBytes, 0, stringBytes.length) + buffer.putInt(v.numBytes()) + v.writeTo(buffer) } override def extract(buffer: ByteBuffer): UTF8String = { val length = buffer.getInt() - val stringBytes = new Array[Byte](length) - buffer.get(stringBytes, 0, length) - UTF8String.fromBytes(stringBytes) + assert(buffer.hasArray) + val base = buffer.array() + val offset = buffer.arrayOffset() + val cursor = buffer.position() + buffer.position(cursor + length) + UTF8String.fromBytes(base, offset + cursor, length) } override def setField(row: MutableRow, ordinal: Int, value: UTF8String): Unit = { @@ -386,11 +390,6 @@ private[sql] sealed abstract class ByteArrayColumnType[JvmType](val defaultSize: def serialize(value: JvmType): Array[Byte] def deserialize(bytes: Array[Byte]): JvmType - override def actualSize(row: InternalRow, ordinal: Int): Int = { - // TODO: grow the buffer in append(), so serialize() will not be called twice - serialize(getField(row, ordinal)).length + 4 - } - override def append(v: JvmType, buffer: ByteBuffer): Unit = { val bytes = serialize(v) buffer.putInt(bytes.length).put(bytes, 0, bytes.length) @@ -416,6 +415,10 @@ private[sql] object BINARY extends ByteArrayColumnType[Array[Byte]](16) { row.getBinary(ordinal) } + override def actualSize(row: InternalRow, ordinal: Int): Int = { + row.getBinary(ordinal).length + 4 + } + def serialize(value: Array[Byte]): Array[Byte] = value def deserialize(bytes: Array[Byte]): Array[Byte] = bytes } @@ -433,6 +436,10 @@ private[sql] case class LARGE_DECIMAL(precision: Int, scale: Int) row.setDecimal(ordinal, value, precision) } + override def actualSize(row: InternalRow, ordinal: Int): Int = { + 4 + getField(row, ordinal).toJavaBigDecimal.unscaledValue().bitLength() / 8 + 1 + } + override def serialize(value: Decimal): Array[Byte] = { value.toJavaBigDecimal.unscaledValue().toByteArray } @@ -449,124 +456,118 @@ private[sql] object LARGE_DECIMAL { } } -private[sql] case class STRUCT(dataType: StructType) - extends ByteArrayColumnType[InternalRow](20) { +private[sql] case class STRUCT(dataType: StructType) extends ColumnType[UnsafeRow] { - private val projection: UnsafeProjection = - UnsafeProjection.create(dataType) private val numOfFields: Int = dataType.fields.size - override def setField(row: MutableRow, ordinal: Int, value: InternalRow): Unit = { + override def defaultSize: Int = 20 + + override def setField(row: MutableRow, ordinal: Int, value: UnsafeRow): Unit = { row.update(ordinal, value) } - override def getField(row: InternalRow, ordinal: Int): InternalRow = { - row.getStruct(ordinal, numOfFields) + override def getField(row: InternalRow, ordinal: Int): UnsafeRow = { + row.getStruct(ordinal, numOfFields).asInstanceOf[UnsafeRow] } - override def serialize(value: InternalRow): Array[Byte] = { - val 
unsafeRow = if (value.isInstanceOf[UnsafeRow]) { - value.asInstanceOf[UnsafeRow] - } else { - projection(value) - } - unsafeRow.getBytes + override def actualSize(row: InternalRow, ordinal: Int): Int = { + 4 + getField(row, ordinal).getSizeInBytes } - override def deserialize(bytes: Array[Byte]): InternalRow = { + override def append(value: UnsafeRow, buffer: ByteBuffer): Unit = { + buffer.putInt(value.getSizeInBytes) + value.writeTo(buffer) + } + + override def extract(buffer: ByteBuffer): UnsafeRow = { + val sizeInBytes = buffer.getInt() + assert(buffer.hasArray) + val base = buffer.array() + val offset = buffer.arrayOffset() + val cursor = buffer.position() + buffer.position(cursor + sizeInBytes) val unsafeRow = new UnsafeRow - unsafeRow.pointTo(bytes, numOfFields, bytes.length) + unsafeRow.pointTo(base, Platform.BYTE_ARRAY_OFFSET + offset + cursor, numOfFields, sizeInBytes) unsafeRow } - override def clone(v: InternalRow): InternalRow = v.copy() + override def clone(v: UnsafeRow): UnsafeRow = v.copy() } -private[sql] case class ARRAY(dataType: ArrayType) - extends ByteArrayColumnType[ArrayData](16) { +private[sql] case class ARRAY(dataType: ArrayType) extends ColumnType[UnsafeArrayData] { - private lazy val projection = UnsafeProjection.create(Array[DataType](dataType)) - private val mutableRow = new GenericMutableRow(new Array[Any](1)) + override def defaultSize: Int = 16 - override def setField(row: MutableRow, ordinal: Int, value: ArrayData): Unit = { + override def setField(row: MutableRow, ordinal: Int, value: UnsafeArrayData): Unit = { row.update(ordinal, value) } - override def getField(row: InternalRow, ordinal: Int): ArrayData = { - row.getArray(ordinal) + override def getField(row: InternalRow, ordinal: Int): UnsafeArrayData = { + row.getArray(ordinal).asInstanceOf[UnsafeArrayData] } - override def serialize(value: ArrayData): Array[Byte] = { - val unsafeArray = if (value.isInstanceOf[UnsafeArrayData]) { - value.asInstanceOf[UnsafeArrayData] - } else { - mutableRow(0) = value - projection(mutableRow).getArray(0) - } - val outputBuffer = - ByteBuffer.allocate(4 + unsafeArray.getSizeInBytes).order(ByteOrder.nativeOrder()) - outputBuffer.putInt(unsafeArray.numElements()) - val underlying = outputBuffer.array() - unsafeArray.writeToMemory(underlying, Platform.BYTE_ARRAY_OFFSET + 4) - underlying + override def actualSize(row: InternalRow, ordinal: Int): Int = { + val unsafeArray = getField(row, ordinal) + 4 + 4 + unsafeArray.getSizeInBytes } - override def deserialize(bytes: Array[Byte]): ArrayData = { - val buffer = ByteBuffer.wrap(bytes).order(ByteOrder.nativeOrder()) - val numElements = buffer.getInt - val array = new UnsafeArrayData - array.pointTo(bytes, Platform.BYTE_ARRAY_OFFSET + 4, numElements, bytes.length - 4) - array + override def append(value: UnsafeArrayData, buffer: ByteBuffer): Unit = { + buffer.putInt(4 + value.getSizeInBytes) + buffer.putInt(value.numElements()) + value.writeTo(buffer) } - override def clone(v: ArrayData): ArrayData = v.copy() + override def extract(buffer: ByteBuffer): UnsafeArrayData = { + val numBytes = buffer.getInt + assert(buffer.hasArray) + val cursor = buffer.position() + buffer.position(cursor + numBytes) + UnsafeReaders.readArray( + buffer.array(), + Platform.BYTE_ARRAY_OFFSET + buffer.arrayOffset() + cursor, + numBytes) + } + + override def clone(v: UnsafeArrayData): UnsafeArrayData = v.copy() } -private[sql] case class MAP(dataType: MapType) extends ByteArrayColumnType[MapData](32) { +private[sql] case class MAP(dataType: MapType) 
extends ColumnType[UnsafeMapData] { - private lazy val projection: UnsafeProjection = UnsafeProjection.create(Array[DataType](dataType)) - private val mutableRow = new GenericMutableRow(new Array[Any](1)) + override def defaultSize: Int = 32 - override def setField(row: MutableRow, ordinal: Int, value: MapData): Unit = { + override def setField(row: MutableRow, ordinal: Int, value: UnsafeMapData): Unit = { row.update(ordinal, value) } - override def getField(row: InternalRow, ordinal: Int): MapData = { - row.getMap(ordinal) + override def getField(row: InternalRow, ordinal: Int): UnsafeMapData = { + row.getMap(ordinal).asInstanceOf[UnsafeMapData] } - override def serialize(value: MapData): Array[Byte] = { - val unsafeMap = if (value.isInstanceOf[UnsafeMapData]) { - value.asInstanceOf[UnsafeMapData] - } else { - mutableRow(0) = value - projection(mutableRow).getMap(0) - } + override def actualSize(row: InternalRow, ordinal: Int): Int = { + val unsafeMap = getField(row, ordinal) + 12 + unsafeMap.keyArray().getSizeInBytes + unsafeMap.valueArray().getSizeInBytes + } + + override def append(value: UnsafeMapData, buffer: ByteBuffer): Unit = { + buffer.putInt(8 + value.keyArray().getSizeInBytes + value.valueArray().getSizeInBytes) + buffer.putInt(value.numElements()) + buffer.putInt(value.keyArray().getSizeInBytes) + value.keyArray().writeTo(buffer) + value.valueArray().writeTo(buffer) + } + + override def extract(buffer: ByteBuffer): UnsafeMapData = { + val numBytes = buffer.getInt + assert(buffer.hasArray) + val cursor = buffer.position() + buffer.position(cursor + numBytes) + UnsafeReaders.readMap( + buffer.array(), + Platform.BYTE_ARRAY_OFFSET + buffer.arrayOffset() + cursor, + numBytes) + } - val outputBuffer = - ByteBuffer.allocate(8 + unsafeMap.getSizeInBytes).order(ByteOrder.nativeOrder()) - outputBuffer.putInt(unsafeMap.numElements()) - val keyBytes = unsafeMap.keyArray().getSizeInBytes - outputBuffer.putInt(keyBytes) - val underlying = outputBuffer.array() - unsafeMap.keyArray().writeToMemory(underlying, Platform.BYTE_ARRAY_OFFSET + 8) - unsafeMap.valueArray().writeToMemory(underlying, Platform.BYTE_ARRAY_OFFSET + 8 + keyBytes) - underlying - } - - override def deserialize(bytes: Array[Byte]): MapData = { - val buffer = ByteBuffer.wrap(bytes).order(ByteOrder.nativeOrder()) - val numElements = buffer.getInt - val keyArraySize = buffer.getInt - val keyArray = new UnsafeArrayData - val valueArray = new UnsafeArrayData - keyArray.pointTo(bytes, Platform.BYTE_ARRAY_OFFSET + 8, numElements, keyArraySize) - valueArray.pointTo(bytes, Platform.BYTE_ARRAY_OFFSET + 8 + keyArraySize, numElements, - bytes.length - 8 - keyArraySize) - new UnsafeMapData(keyArray, valueArray) - } - - override def clone(v: MapData): MapData = v.copy() + override def clone(v: UnsafeMapData): UnsafeMapData = v.copy() } private[sql] object ColumnType { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/columnar/InMemoryColumnarTableScan.scala b/sql/core/src/main/scala/org/apache/spark/sql/columnar/InMemoryColumnarTableScan.scala index d7e145f9c2bb8..d967814f627cb 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/columnar/InMemoryColumnarTableScan.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/columnar/InMemoryColumnarTableScan.scala @@ -27,7 +27,7 @@ import org.apache.spark.sql.catalyst.analysis.MultiInstanceRelation import org.apache.spark.sql.catalyst.dsl.expressions._ import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, Statistics} 
-import org.apache.spark.sql.execution.{LeafNode, SparkPlan} +import org.apache.spark.sql.execution.{ConvertToUnsafe, LeafNode, SparkPlan} import org.apache.spark.storage.StorageLevel import org.apache.spark.{Accumulable, Accumulator, Accumulators} @@ -38,7 +38,9 @@ private[sql] object InMemoryRelation { storageLevel: StorageLevel, child: SparkPlan, tableName: Option[String]): InMemoryRelation = - new InMemoryRelation(child.output, useCompression, batchSize, storageLevel, child, tableName)() + new InMemoryRelation(child.output, useCompression, batchSize, storageLevel, + if (child.outputsUnsafeRows) child else ConvertToUnsafe(child), + tableName)() } private[sql] case class CachedBatch(buffers: Array[Array[Byte]], stats: InternalRow) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/columnar/ColumnTypeSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/columnar/ColumnTypeSuite.scala index ceb8ad97bb320..0e6e1bcf72896 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/columnar/ColumnTypeSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/columnar/ColumnTypeSuite.scala @@ -17,11 +17,11 @@ package org.apache.spark.sql.columnar -import java.nio.ByteBuffer +import java.nio.{ByteOrder, ByteBuffer} import org.apache.spark.sql.Row import org.apache.spark.sql.catalyst.CatalystTypeConverters -import org.apache.spark.sql.catalyst.expressions.GenericMutableRow +import org.apache.spark.sql.catalyst.expressions.{UnsafeProjection, GenericMutableRow} import org.apache.spark.sql.columnar.ColumnarTestUtils._ import org.apache.spark.sql.types._ import org.apache.spark.{Logging, SparkFunSuite} @@ -55,7 +55,8 @@ class ColumnTypeSuite extends SparkFunSuite with Logging { assertResult(expected, s"Wrong actualSize for $columnType") { val row = new GenericMutableRow(1) row.update(0, CatalystTypeConverters.convertToCatalyst(value)) - columnType.actualSize(row, 0) + val proj = UnsafeProjection.create(Array[DataType](columnType.dataType)) + columnType.actualSize(proj(row), 0) } } @@ -99,35 +100,27 @@ class ColumnTypeSuite extends SparkFunSuite with Logging { def testColumnType[JvmType](columnType: ColumnType[JvmType]): Unit = { - val buffer = ByteBuffer.allocate(DEFAULT_BUFFER_SIZE) - val seq = (0 until 4).map(_ => makeRandomValue(columnType)) + val buffer = ByteBuffer.allocate(DEFAULT_BUFFER_SIZE).order(ByteOrder.nativeOrder()) + val proj = UnsafeProjection.create(Array[DataType](columnType.dataType)) val converter = CatalystTypeConverters.createToScalaConverter(columnType.dataType) + val seq = (0 until 4).map(_ => proj(makeRandomRow(columnType)).copy()) test(s"$columnType append/extract") { buffer.rewind() - seq.foreach(columnType.append(_, buffer)) + seq.foreach(columnType.append(_, 0, buffer)) buffer.rewind() - seq.foreach { expected => - logInfo("buffer = " + buffer + ", expected = " + expected) - val extracted = columnType.extract(buffer) - assert( - converter(expected) === converter(extracted), - "Extracted value didn't equal to the original one. " + - hexDump(expected) + " != " + hexDump(extracted) + - ", buffer = " + dumpBuffer(buffer.duplicate().rewind().asInstanceOf[ByteBuffer])) + seq.foreach { row => + logInfo("buffer = " + buffer + ", expected = " + row) + val expected = converter(row.get(0, columnType.dataType)) + val extracted = converter(columnType.extract(buffer)) + assert(expected === extracted, + s"Extracted value didn't equal to the original one. 
$expected != $extracted, buffer =" + + dumpBuffer(buffer.duplicate().rewind().asInstanceOf[ByteBuffer])) } } } - private def hexDump(value: Any): String = { - if (value == null) { - "" - } else { - value.toString.map(ch => Integer.toHexString(ch & 0xffff)).mkString(" ") - } - } - private def dumpBuffer(buff: ByteBuffer): Any = { val sb = new StringBuilder() while (buff.hasRemaining) { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/columnar/NullableColumnAccessorSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/columnar/NullableColumnAccessorSuite.scala index 78cebbf3cc934..aa1605fee8c73 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/columnar/NullableColumnAccessorSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/columnar/NullableColumnAccessorSuite.scala @@ -21,7 +21,7 @@ import java.nio.ByteBuffer import org.apache.spark.SparkFunSuite import org.apache.spark.sql.catalyst.CatalystTypeConverters -import org.apache.spark.sql.catalyst.expressions.GenericMutableRow +import org.apache.spark.sql.catalyst.expressions.{UnsafeProjection, GenericMutableRow} import org.apache.spark.sql.types._ class TestNullableColumnAccessor[JvmType]( @@ -64,10 +64,11 @@ class NullableColumnAccessorSuite extends SparkFunSuite { test(s"Nullable $typeName column accessor: access null values") { val builder = TestNullableColumnBuilder(columnType) val randomRow = makeRandomRow(columnType) + val proj = UnsafeProjection.create(Array[DataType](columnType.dataType)) (0 until 4).foreach { _ => - builder.appendFrom(randomRow, 0) - builder.appendFrom(nullRow, 0) + builder.appendFrom(proj(randomRow), 0) + builder.appendFrom(proj(nullRow), 0) } val accessor = TestNullableColumnAccessor(builder.build(), columnType) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/columnar/NullableColumnBuilderSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/columnar/NullableColumnBuilderSuite.scala index fba08e626d720..91404577832a0 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/columnar/NullableColumnBuilderSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/columnar/NullableColumnBuilderSuite.scala @@ -19,7 +19,7 @@ package org.apache.spark.sql.columnar import org.apache.spark.SparkFunSuite import org.apache.spark.sql.catalyst.CatalystTypeConverters -import org.apache.spark.sql.catalyst.expressions.GenericMutableRow +import org.apache.spark.sql.catalyst.expressions.{UnsafeProjection, GenericMutableRow} import org.apache.spark.sql.types._ class TestNullableColumnBuilder[JvmType](columnType: ColumnType[JvmType]) @@ -51,6 +51,9 @@ class NullableColumnBuilderSuite extends SparkFunSuite { columnType: ColumnType[JvmType]): Unit = { val typeName = columnType.getClass.getSimpleName.stripSuffix("$") + val dataType = columnType.dataType + val proj = UnsafeProjection.create(Array[DataType](dataType)) + val converter = CatalystTypeConverters.createToScalaConverter(dataType) test(s"$typeName column builder: empty column") { val columnBuilder = TestNullableColumnBuilder(columnType) @@ -65,7 +68,7 @@ class NullableColumnBuilderSuite extends SparkFunSuite { val randomRow = makeRandomRow(columnType) (0 until 4).foreach { _ => - columnBuilder.appendFrom(randomRow, 0) + columnBuilder.appendFrom(proj(randomRow), 0) } val buffer = columnBuilder.build() @@ -77,12 +80,10 @@ class NullableColumnBuilderSuite extends SparkFunSuite { val columnBuilder = TestNullableColumnBuilder(columnType) val randomRow = makeRandomRow(columnType) val nullRow = makeNullRow(1) - val dataType = 
columnType.dataType - val converter = CatalystTypeConverters.createToScalaConverter(dataType) (0 until 4).foreach { _ => - columnBuilder.appendFrom(randomRow, 0) - columnBuilder.appendFrom(nullRow, 0) + columnBuilder.appendFrom(proj(randomRow), 0) + columnBuilder.appendFrom(proj(nullRow), 0) } val buffer = columnBuilder.build() diff --git a/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java b/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java index 216aeea60d1c8..b7aecb5102ba6 100644 --- a/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java +++ b/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java @@ -19,6 +19,7 @@ import javax.annotation.Nonnull; import java.io.*; +import java.nio.ByteBuffer; import java.nio.ByteOrder; import java.util.Arrays; import java.util.Map; @@ -137,6 +138,15 @@ public void writeToMemory(Object target, long targetOffset) { Platform.copyMemory(base, offset, target, targetOffset, numBytes); } + public void writeTo(ByteBuffer buffer) { + assert(buffer.hasArray()); + byte[] target = buffer.array(); + int offset = buffer.arrayOffset(); + int pos = buffer.position(); + writeToMemory(target, Platform.BYTE_ARRAY_OFFSET + offset + pos); + buffer.position(pos + numBytes); + } + /** * Returns the number of bytes for a code point with the first byte as `b` * @param b The first byte of a code point From 626aab79c9b4d4ac9d65bf5fa45b81dd9cbc609c Mon Sep 17 00:00:00 2001 From: Lianhui Wang Date: Tue, 13 Oct 2015 08:29:47 -0500 Subject: [PATCH 030/139] [SPARK-11026] [YARN] spark.yarn.user.classpath.first doesn't work for 'spark-submit --jars hdfs://user/foo.jar' When spark.yarn.user.classpath.first=true and 'spark-submit --jars hdfs://user/foo.jar' is used, foo.jar cannot be put on the system classpath, so we need to add YARN's link names for the jars to the system classpath. vanzin tgravescs Author: Lianhui Wang Closes #9045 from lianhuiwang/spark-11026. --- .../org/apache/spark/deploy/yarn/Client.scala | 23 ++++++++++++------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala b/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala index d25d830fd4349..9fcfe362a3ba2 100644 --- a/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala +++ b/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala @@ -1212,7 +1212,7 @@ object Client extends Logging { } else { getMainJarUri(sparkConf.getOption(CONF_SPARK_USER_JAR)) } - mainJar.foreach(addFileToClasspath(sparkConf, _, APP_JAR, env)) + mainJar.foreach(addFileToClasspath(sparkConf, conf, _, APP_JAR, env)) val secondaryJars = if (args != null) { @@ -1221,10 +1221,10 @@ object Client extends Logging { getSecondaryJarUris(sparkConf.getOption(CONF_SPARK_YARN_SECONDARY_JARS)) } secondaryJars.foreach { x => - addFileToClasspath(sparkConf, x, null, env) + addFileToClasspath(sparkConf, conf, x, null, env) } } - addFileToClasspath(sparkConf, new URI(sparkJar(sparkConf)), SPARK_JAR, env) + addFileToClasspath(sparkConf, conf, new URI(sparkJar(sparkConf)), SPARK_JAR, env) populateHadoopClasspath(conf, env) sys.env.get(ENV_DIST_CLASSPATH).foreach { cp => addClasspathEntry(getClusterPath(sparkConf, cp), env) @@ -1259,15 +1259,17 @@ object Client extends Logging { * If an alternate name for the file is given, and it's not a "local:" file, the alternate * name will be added to the classpath (relative to the job's work directory). * - * If not a "local:" file and no alternate name, the environment is not modified.
+ * If not a "local:" file and no alternate name, the linkName will be added to the classpath. * - * @param conf Spark configuration. - * @param uri URI to add to classpath (optional). - * @param fileName Alternate name for the file (optional). - * @param env Map holding the environment variables. + * @param conf Spark configuration. + * @param hadoopConf Hadoop configuration. + * @param uri URI to add to classpath (optional). + * @param fileName Alternate name for the file (optional). + * @param env Map holding the environment variables. */ private def addFileToClasspath( conf: SparkConf, + hadoopConf: Configuration, uri: URI, fileName: String, env: HashMap[String, String]): Unit = { @@ -1276,6 +1278,11 @@ object Client extends Logging { } else if (fileName != null) { addClasspathEntry(buildPath( YarnSparkHadoopUtil.expandEnvironment(Environment.PWD), fileName), env) + } else if (uri != null) { + val localPath = getQualifiedLocalPath(uri, hadoopConf) + val linkName = Option(uri.getFragment()).getOrElse(localPath.getName()) + addClasspathEntry(buildPath( + YarnSparkHadoopUtil.expandEnvironment(Environment.PWD), linkName), env) } } From 6987c067937a50867b4d5788f5bf496ecdfdb62c Mon Sep 17 00:00:00 2001 From: Davies Liu Date: Tue, 13 Oct 2015 09:40:36 -0700 Subject: [PATCH 031/139] [SPARK-11009] [SQL] fix wrong result of Window function in cluster mode Currently, All windows function could generate wrong result in cluster sometimes. The root cause is that AttributeReference is called in executor, then id of it may not be unique than others created in driver. Here is the script that could reproduce the problem (run in local cluster): ``` from pyspark import SparkContext, HiveContext from pyspark.sql.window import Window from pyspark.sql.functions import rowNumber sqlContext = HiveContext(SparkContext()) sqlContext.setConf("spark.sql.shuffle.partitions", "3") df = sqlContext.range(1<<20) df2 = df.select((df.id % 1000).alias("A"), (df.id / 1000).alias('B')) ws = Window.partitionBy(df2.A).orderBy(df2.B) df3 = df2.select("client", "date", rowNumber().over(ws).alias("rn")).filter("rn < 0") assert df3.count() == 0 ``` Author: Davies Liu Author: Yin Huai Closes #9050 from davies/wrong_window. --- .../apache/spark/sql/execution/Window.scala | 20 ++++----- .../spark/sql/hive/HiveSparkSubmitSuite.scala | 41 +++++++++++++++++++ 2 files changed, 51 insertions(+), 10 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/Window.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/Window.scala index f8929530c5036..55035f4bc5f2a 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/Window.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/Window.scala @@ -145,11 +145,10 @@ case class Window( // Construct the ordering. This is used to compare the result of current value projection // to the result of bound value projection. This is done manually because we want to use // Code Generation (if it is enabled). 
- val (sortExprs, schema) = exprs.map { case e => - val ref = AttributeReference("ordExpr", e.dataType, e.nullable)() - (SortOrder(ref, e.direction), ref) - }.unzip - val ordering = newOrdering(sortExprs, schema) + val sortExprs = exprs.zipWithIndex.map { case (e, i) => + SortOrder(BoundReference(i, e.dataType, e.nullable), e.direction) + } + val ordering = newOrdering(sortExprs, Nil) RangeBoundOrdering(ordering, current, bound) case RowFrame => RowBoundOrdering(offset) } @@ -205,14 +204,15 @@ case class Window( */ private[this] def createResultProjection( expressions: Seq[Expression]): MutableProjection = { - val unboundToAttr = expressions.map { - e => (e, AttributeReference("windowResult", e.dataType, e.nullable)()) + val references = expressions.zipWithIndex.map{ case (e, i) => + // Results of window expressions will be on the right side of child's output + BoundReference(child.output.size + i, e.dataType, e.nullable) } - val unboundToAttrMap = unboundToAttr.toMap - val patchedWindowExpression = windowExpression.map(_.transform(unboundToAttrMap)) + val unboundToRefMap = expressions.zip(references).toMap + val patchedWindowExpression = windowExpression.map(_.transform(unboundToRefMap)) newMutableProjection( projectList ++ patchedWindowExpression, - child.output ++ unboundToAttr.map(_._2))() + child.output)() } protected override def doExecute(): RDD[InternalRow] = { diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveSparkSubmitSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveSparkSubmitSuite.scala index 5f1660b62d418..10e4ae2c50308 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveSparkSubmitSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveSparkSubmitSuite.scala @@ -30,6 +30,7 @@ import org.scalatest.time.SpanSugar._ import org.apache.spark._ import org.apache.spark.sql.{SQLContext, QueryTest} +import org.apache.spark.sql.expressions.Window import org.apache.spark.sql.hive.test.{TestHive, TestHiveContext} import org.apache.spark.sql.test.ProcessTestUtils.ProcessOutputCapturer import org.apache.spark.sql.types.DecimalType @@ -107,6 +108,16 @@ class HiveSparkSubmitSuite runSparkSubmit(args) } + test("SPARK-11009 fix wrong result of Window function in cluster mode") { + val unusedJar = TestUtils.createJarWithClasses(Seq.empty) + val args = Seq( + "--class", SPARK_11009.getClass.getName.stripSuffix("$"), + "--name", "SparkSQLConfTest", + "--master", "local-cluster[2,1,1024]", + unusedJar.toString) + runSparkSubmit(args) + } + // NOTE: This is an expensive operation in terms of time (10 seconds+). Use sparingly. 
// This is copied from org.apache.spark.deploy.SparkSubmitSuite private def runSparkSubmit(args: Seq[String]): Unit = { @@ -320,3 +331,33 @@ object SPARK_9757 extends QueryTest { } } } + +object SPARK_11009 extends QueryTest { + import org.apache.spark.sql.functions._ + + protected var sqlContext: SQLContext = _ + + def main(args: Array[String]): Unit = { + Utils.configTestLog4j("INFO") + + val sparkContext = new SparkContext( + new SparkConf() + .set("spark.ui.enabled", "false") + .set("spark.sql.shuffle.partitions", "100")) + + val hiveContext = new TestHiveContext(sparkContext) + sqlContext = hiveContext + + try { + val df = sqlContext.range(1 << 20) + val df2 = df.select((df("id") % 1000).alias("A"), (df("id") / 1000).alias("B")) + val ws = Window.partitionBy(df2("A")).orderBy(df2("B")) + val df3 = df2.select(df2("A"), df2("B"), rowNumber().over(ws).alias("rn")).filter("rn < 0") + if (df3.rdd.count() != 0) { + throw new Exception("df3 should have 0 output row.") + } + } finally { + sparkContext.stop() + } + } +} From 1797055dbf1d2fd7714d7c65c8d2efde2f15efc1 Mon Sep 17 00:00:00 2001 From: Reynold Xin Date: Tue, 13 Oct 2015 09:51:20 -0700 Subject: [PATCH 032/139] [SPARK-11079] Post-hoc review Netty-based RPC - round 1 I'm going through the implementation right now for post-hoc review. Adding more comments and renaming things as I go through them. I also want to write higher level documentation about how the whole thing works -- but those will come in other pull requests. Author: Reynold Xin Closes #9091 from rxin/rpc-review. --- .../org/apache/spark/MapOutputTracker.scala | 2 +- .../org/apache/spark/rpc/RpcAddress.scala | 50 ++++++ .../org/apache/spark/rpc/RpcEndpoint.scala | 3 +- .../scala/org/apache/spark/rpc/RpcEnv.scala | 153 +----------------- .../org/apache/spark/rpc/RpcTimeout.scala | 131 +++++++++++++++ .../apache/spark/rpc/akka/AkkaRpcEnv.scala | 4 - .../apache/spark/rpc/netty/Dispatcher.scala | 108 +++++++------ .../apache/spark/rpc/netty/IDVerifier.scala | 4 +- .../org/apache/spark/rpc/netty/Inbox.scala | 119 ++++++-------- .../spark/rpc/netty/NettyRpcCallContext.scala | 11 +- .../apache/spark/rpc/netty/NettyRpcEnv.scala | 38 +++-- .../org/apache/spark/util/ThreadUtils.scala | 1 - .../scala/org/apache/spark/util/Utils.scala | 1 + .../apache/spark/rpc/netty/InboxSuite.scala | 6 +- .../rpc/netty/NettyRpcHandlerSuite.scala | 7 +- 15 files changed, 336 insertions(+), 302 deletions(-) create mode 100644 core/src/main/scala/org/apache/spark/rpc/RpcAddress.scala create mode 100644 core/src/main/scala/org/apache/spark/rpc/RpcTimeout.scala diff --git a/core/src/main/scala/org/apache/spark/MapOutputTracker.scala b/core/src/main/scala/org/apache/spark/MapOutputTracker.scala index 45e12e40c837f..72355cdfa68b3 100644 --- a/core/src/main/scala/org/apache/spark/MapOutputTracker.scala +++ b/core/src/main/scala/org/apache/spark/MapOutputTracker.scala @@ -48,7 +48,7 @@ private[spark] class MapOutputTrackerMasterEndpoint( val hostPort = context.senderAddress.hostPort logInfo("Asked to send map output locations for shuffle " + shuffleId + " to " + hostPort) val mapOutputStatuses = tracker.getSerializedMapOutputStatuses(shuffleId) - val serializedSize = mapOutputStatuses.size + val serializedSize = mapOutputStatuses.length if (serializedSize > maxAkkaFrameSize) { val msg = s"Map output statuses were $serializedSize bytes which " + s"exceeds spark.akka.frameSize ($maxAkkaFrameSize bytes)."
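To make the refactoring below easier to follow, here is a minimal usage sketch of the two helpers this patch splits into their own files, RpcAddress and RpcTimeout. It is illustrative only and not part of the patch: both classes are private[spark], so a snippet like this has to live under an org.apache.spark package, and the timeout key shown is just an example.

```
package org.apache.spark

// Sketch only: assumes the classes introduced in the files below are on the classpath.
import java.util.concurrent.TimeoutException
import scala.concurrent.Promise
import scala.concurrent.duration._
import org.apache.spark.rpc.{RpcAddress, RpcTimeout}

object RpcHelpersSketch {
  def main(args: Array[String]): Unit = {
    // RpcAddress: parse and render "spark://host:port" style addresses.
    val addr = RpcAddress.fromSparkURL("spark://host1:7077")
    assert(addr.hostPort == "host1:7077" && addr.toSparkURL == "spark://host1:7077")

    // RpcTimeout couples a duration with the config key that controls it, so a
    // timeout failure reports which property to tune.
    val askTimeout = new RpcTimeout(2.seconds, "spark.rpc.askTimeout")
    val neverCompleted = Promise[String]().future
    try {
      askTimeout.awaitResult(neverCompleted)
    } catch {
      case e: TimeoutException =>
        // Message ends with "... This timeout is controlled by spark.rpc.askTimeout".
        println(e.getMessage)
    }
  }
}
```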
diff --git a/core/src/main/scala/org/apache/spark/rpc/RpcAddress.scala b/core/src/main/scala/org/apache/spark/rpc/RpcAddress.scala new file mode 100644 index 0000000000000..eb0b26947f504 --- /dev/null +++ b/core/src/main/scala/org/apache/spark/rpc/RpcAddress.scala @@ -0,0 +1,50 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.rpc + +import org.apache.spark.util.Utils + + +/** + * Address for an RPC environment, with hostname and port. + */ +private[spark] case class RpcAddress(host: String, port: Int) { + + def hostPort: String = host + ":" + port + + /** Returns a string in the form of "spark://host:port". */ + def toSparkURL: String = "spark://" + hostPort + + override def toString: String = hostPort +} + + +private[spark] object RpcAddress { + + /** Return the [[RpcAddress]] represented by `uri`. */ + def fromURIString(uri: String): RpcAddress = { + val uriObj = new java.net.URI(uri) + RpcAddress(uriObj.getHost, uriObj.getPort) + } + + /** Returns the [[RpcAddress]] encoded in the form of "spark://host:port" */ + def fromSparkURL(sparkUrl: String): RpcAddress = { + val (host, port) = Utils.extractHostPortFromSparkUrl(sparkUrl) + RpcAddress(host, port) + } +} diff --git a/core/src/main/scala/org/apache/spark/rpc/RpcEndpoint.scala b/core/src/main/scala/org/apache/spark/rpc/RpcEndpoint.scala index f1ddc6d2cd438..0ba95169529e6 100644 --- a/core/src/main/scala/org/apache/spark/rpc/RpcEndpoint.scala +++ b/core/src/main/scala/org/apache/spark/rpc/RpcEndpoint.scala @@ -145,5 +145,4 @@ private[spark] trait RpcEndpoint { * However, there is no guarantee that the same thread will be executing the same * [[ThreadSafeRpcEndpoint]] for different messages. 
*/ -private[spark] trait ThreadSafeRpcEndpoint extends RpcEndpoint { -} +private[spark] trait ThreadSafeRpcEndpoint extends RpcEndpoint diff --git a/core/src/main/scala/org/apache/spark/rpc/RpcEnv.scala b/core/src/main/scala/org/apache/spark/rpc/RpcEnv.scala index 35e402c725331..ef491a0ae4f09 100644 --- a/core/src/main/scala/org/apache/spark/rpc/RpcEnv.scala +++ b/core/src/main/scala/org/apache/spark/rpc/RpcEnv.scala @@ -17,12 +17,7 @@ package org.apache.spark.rpc -import java.net.URI -import java.util.concurrent.TimeoutException - -import scala.concurrent.{Awaitable, Await, Future} -import scala.concurrent.duration._ -import scala.language.postfixOps +import scala.concurrent.Future import org.apache.spark.{SecurityManager, SparkConf} import org.apache.spark.util.{RpcUtils, Utils} @@ -35,8 +30,8 @@ import org.apache.spark.util.{RpcUtils, Utils} private[spark] object RpcEnv { private def getRpcEnvFactory(conf: SparkConf): RpcEnvFactory = { - // Add more RpcEnv implementations here - val rpcEnvNames = Map("akka" -> "org.apache.spark.rpc.akka.AkkaRpcEnvFactory", + val rpcEnvNames = Map( + "akka" -> "org.apache.spark.rpc.akka.AkkaRpcEnvFactory", "netty" -> "org.apache.spark.rpc.netty.NettyRpcEnvFactory") val rpcEnvName = conf.get("spark.rpc", "netty") val rpcEnvFactoryClassName = rpcEnvNames.getOrElse(rpcEnvName.toLowerCase, rpcEnvName) @@ -53,7 +48,6 @@ private[spark] object RpcEnv { val config = RpcEnvConfig(conf, name, host, port, securityManager) getRpcEnvFactory(conf).create(config) } - } @@ -155,144 +149,3 @@ private[spark] case class RpcEnvConfig( host: String, port: Int, securityManager: SecurityManager) - - -/** - * Represents a host and port. - */ -private[spark] case class RpcAddress(host: String, port: Int) { - // TODO do we need to add the type of RpcEnv in the address? - - val hostPort: String = host + ":" + port - - override val toString: String = hostPort - - def toSparkURL: String = "spark://" + hostPort -} - - -private[spark] object RpcAddress { - - /** - * Return the [[RpcAddress]] represented by `uri`. - */ - def fromURI(uri: URI): RpcAddress = { - RpcAddress(uri.getHost, uri.getPort) - } - - /** - * Return the [[RpcAddress]] represented by `uri`. - */ - def fromURIString(uri: String): RpcAddress = { - fromURI(new java.net.URI(uri)) - } - - def fromSparkURL(sparkUrl: String): RpcAddress = { - val (host, port) = Utils.extractHostPortFromSparkUrl(sparkUrl) - RpcAddress(host, port) - } -} - - -/** - * An exception thrown if RpcTimeout modifies a [[TimeoutException]]. - */ -private[rpc] class RpcTimeoutException(message: String, cause: TimeoutException) - extends TimeoutException(message) { initCause(cause) } - - -/** - * Associates a timeout with a description so that a when a TimeoutException occurs, additional - * context about the timeout can be amended to the exception message. - * @param duration timeout duration in seconds - * @param timeoutProp the configuration property that controls this timeout - */ -private[spark] class RpcTimeout(val duration: FiniteDuration, val timeoutProp: String) - extends Serializable { - - /** Amends the standard message of TimeoutException to include the description */ - private def createRpcTimeoutException(te: TimeoutException): RpcTimeoutException = { - new RpcTimeoutException(te.getMessage() + ". 
This timeout is controlled by " + timeoutProp, te) - } - - /** - * PartialFunction to match a TimeoutException and add the timeout description to the message - * - * @note This can be used in the recover callback of a Future to add to a TimeoutException - * Example: - * val timeout = new RpcTimeout(5 millis, "short timeout") - * Future(throw new TimeoutException).recover(timeout.addMessageIfTimeout) - */ - def addMessageIfTimeout[T]: PartialFunction[Throwable, T] = { - // The exception has already been converted to a RpcTimeoutException so just raise it - case rte: RpcTimeoutException => throw rte - // Any other TimeoutException get converted to a RpcTimeoutException with modified message - case te: TimeoutException => throw createRpcTimeoutException(te) - } - - /** - * Wait for the completed result and return it. If the result is not available within this - * timeout, throw a [[RpcTimeoutException]] to indicate which configuration controls the timeout. - * @param awaitable the `Awaitable` to be awaited - * @throws RpcTimeoutException if after waiting for the specified time `awaitable` - * is still not ready - */ - def awaitResult[T](awaitable: Awaitable[T]): T = { - try { - Await.result(awaitable, duration) - } catch addMessageIfTimeout - } -} - - -private[spark] object RpcTimeout { - - /** - * Lookup the timeout property in the configuration and create - * a RpcTimeout with the property key in the description. - * @param conf configuration properties containing the timeout - * @param timeoutProp property key for the timeout in seconds - * @throws NoSuchElementException if property is not set - */ - def apply(conf: SparkConf, timeoutProp: String): RpcTimeout = { - val timeout = { conf.getTimeAsSeconds(timeoutProp) seconds } - new RpcTimeout(timeout, timeoutProp) - } - - /** - * Lookup the timeout property in the configuration and create - * a RpcTimeout with the property key in the description. - * Uses the given default value if property is not set - * @param conf configuration properties containing the timeout - * @param timeoutProp property key for the timeout in seconds - * @param defaultValue default timeout value in seconds if property not found - */ - def apply(conf: SparkConf, timeoutProp: String, defaultValue: String): RpcTimeout = { - val timeout = { conf.getTimeAsSeconds(timeoutProp, defaultValue) seconds } - new RpcTimeout(timeout, timeoutProp) - } - - /** - * Lookup prioritized list of timeout properties in the configuration - * and create a RpcTimeout with the first set property key in the - * description. 
- * Uses the given default value if property is not set - * @param conf configuration properties containing the timeout - * @param timeoutPropList prioritized list of property keys for the timeout in seconds - * @param defaultValue default timeout value in seconds if no properties found - */ - def apply(conf: SparkConf, timeoutPropList: Seq[String], defaultValue: String): RpcTimeout = { - require(timeoutPropList.nonEmpty) - - // Find the first set property or use the default value with the first property - val itr = timeoutPropList.iterator - var foundProp: Option[(String, String)] = None - while (itr.hasNext && foundProp.isEmpty){ - val propKey = itr.next() - conf.getOption(propKey).foreach { prop => foundProp = Some(propKey, prop) } - } - val finalProp = foundProp.getOrElse(timeoutPropList.head, defaultValue) - val timeout = { Utils.timeStringAsSeconds(finalProp._2) seconds } - new RpcTimeout(timeout, finalProp._1) - } -} diff --git a/core/src/main/scala/org/apache/spark/rpc/RpcTimeout.scala b/core/src/main/scala/org/apache/spark/rpc/RpcTimeout.scala new file mode 100644 index 0000000000000..285786ebf9f1b --- /dev/null +++ b/core/src/main/scala/org/apache/spark/rpc/RpcTimeout.scala @@ -0,0 +1,131 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.rpc + +import java.util.concurrent.TimeoutException + +import scala.concurrent.{Awaitable, Await} +import scala.concurrent.duration._ + +import org.apache.spark.SparkConf +import org.apache.spark.util.Utils + + +/** + * An exception thrown if RpcTimeout modifies a [[TimeoutException]]. + */ +private[rpc] class RpcTimeoutException(message: String, cause: TimeoutException) + extends TimeoutException(message) { initCause(cause) } + + +/** + * Associates a timeout with a description so that a when a TimeoutException occurs, additional + * context about the timeout can be amended to the exception message. + * + * @param duration timeout duration in seconds + * @param timeoutProp the configuration property that controls this timeout + */ +private[spark] class RpcTimeout(val duration: FiniteDuration, val timeoutProp: String) + extends Serializable { + + /** Amends the standard message of TimeoutException to include the description */ + private def createRpcTimeoutException(te: TimeoutException): RpcTimeoutException = { + new RpcTimeoutException(te.getMessage + ". 
This timeout is controlled by " + timeoutProp, te) + } + + /** + * PartialFunction to match a TimeoutException and add the timeout description to the message + * + * @note This can be used in the recover callback of a Future to add to a TimeoutException + * Example: + * val timeout = new RpcTimeout(5 millis, "short timeout") + * Future(throw new TimeoutException).recover(timeout.addMessageIfTimeout) + */ + def addMessageIfTimeout[T]: PartialFunction[Throwable, T] = { + // The exception has already been converted to a RpcTimeoutException so just raise it + case rte: RpcTimeoutException => throw rte + // Any other TimeoutException get converted to a RpcTimeoutException with modified message + case te: TimeoutException => throw createRpcTimeoutException(te) + } + + /** + * Wait for the completed result and return it. If the result is not available within this + * timeout, throw a [[RpcTimeoutException]] to indicate which configuration controls the timeout. + * @param awaitable the `Awaitable` to be awaited + * @throws RpcTimeoutException if after waiting for the specified time `awaitable` + * is still not ready + */ + def awaitResult[T](awaitable: Awaitable[T]): T = { + try { + Await.result(awaitable, duration) + } catch addMessageIfTimeout + } +} + + +private[spark] object RpcTimeout { + + /** + * Lookup the timeout property in the configuration and create + * a RpcTimeout with the property key in the description. + * @param conf configuration properties containing the timeout + * @param timeoutProp property key for the timeout in seconds + * @throws NoSuchElementException if property is not set + */ + def apply(conf: SparkConf, timeoutProp: String): RpcTimeout = { + val timeout = { conf.getTimeAsSeconds(timeoutProp).seconds } + new RpcTimeout(timeout, timeoutProp) + } + + /** + * Lookup the timeout property in the configuration and create + * a RpcTimeout with the property key in the description. + * Uses the given default value if property is not set + * @param conf configuration properties containing the timeout + * @param timeoutProp property key for the timeout in seconds + * @param defaultValue default timeout value in seconds if property not found + */ + def apply(conf: SparkConf, timeoutProp: String, defaultValue: String): RpcTimeout = { + val timeout = { conf.getTimeAsSeconds(timeoutProp, defaultValue).seconds } + new RpcTimeout(timeout, timeoutProp) + } + + /** + * Lookup prioritized list of timeout properties in the configuration + * and create a RpcTimeout with the first set property key in the + * description. 
+ * Uses the given default value if property is not set + * @param conf configuration properties containing the timeout + * @param timeoutPropList prioritized list of property keys for the timeout in seconds + * @param defaultValue default timeout value in seconds if no properties found + */ + def apply(conf: SparkConf, timeoutPropList: Seq[String], defaultValue: String): RpcTimeout = { + require(timeoutPropList.nonEmpty) + + // Find the first set property or use the default value with the first property + val itr = timeoutPropList.iterator + var foundProp: Option[(String, String)] = None + while (itr.hasNext && foundProp.isEmpty){ + val propKey = itr.next() + conf.getOption(propKey).foreach { prop => foundProp = Some(propKey, prop) } + } + val finalProp = foundProp.getOrElse(timeoutPropList.head, defaultValue) + val timeout = { Utils.timeStringAsSeconds(finalProp._2).seconds } + new RpcTimeout(timeout, finalProp._1) + } +} diff --git a/core/src/main/scala/org/apache/spark/rpc/akka/AkkaRpcEnv.scala b/core/src/main/scala/org/apache/spark/rpc/akka/AkkaRpcEnv.scala index 95132a4e4a0bf..3fad595a0d0b0 100644 --- a/core/src/main/scala/org/apache/spark/rpc/akka/AkkaRpcEnv.scala +++ b/core/src/main/scala/org/apache/spark/rpc/akka/AkkaRpcEnv.scala @@ -39,10 +39,6 @@ import org.apache.spark.util.{ActorLogReceive, AkkaUtils, ThreadUtils} * * TODO Once we remove all usages of Akka in other place, we can move this file to a new project and * remove Akka from the dependencies. - * - * @param actorSystem - * @param conf - * @param boundPort */ private[spark] class AkkaRpcEnv private[akka] ( val actorSystem: ActorSystem, conf: SparkConf, boundPort: Int) diff --git a/core/src/main/scala/org/apache/spark/rpc/netty/Dispatcher.scala b/core/src/main/scala/org/apache/spark/rpc/netty/Dispatcher.scala index d71e6f01dbb29..398e9eafc1444 100644 --- a/core/src/main/scala/org/apache/spark/rpc/netty/Dispatcher.scala +++ b/core/src/main/scala/org/apache/spark/rpc/netty/Dispatcher.scala @@ -17,7 +17,7 @@ package org.apache.spark.rpc.netty -import java.util.concurrent.{ConcurrentHashMap, LinkedBlockingQueue, TimeUnit} +import java.util.concurrent.{ThreadPoolExecutor, ConcurrentHashMap, LinkedBlockingQueue, TimeUnit} import javax.annotation.concurrent.GuardedBy import scala.collection.JavaConverters._ @@ -38,12 +38,16 @@ private[netty] class Dispatcher(nettyEnv: NettyRpcEnv) extends Logging { val inbox = new Inbox(ref, endpoint) } - private val endpoints = new ConcurrentHashMap[String, EndpointData]() - private val endpointRefs = new ConcurrentHashMap[RpcEndpoint, RpcEndpointRef]() + private val endpoints = new ConcurrentHashMap[String, EndpointData] + private val endpointRefs = new ConcurrentHashMap[RpcEndpoint, RpcEndpointRef] // Track the receivers whose inboxes may contain messages. private val receivers = new LinkedBlockingQueue[EndpointData]() + /** + * True if the dispatcher has been stopped. Once stopped, all messages posted will be bounced + * immediately. 
+ */ @GuardedBy("this") private var stopped = false @@ -59,7 +63,7 @@ private[netty] class Dispatcher(nettyEnv: NettyRpcEnv) extends Logging { } val data = endpoints.get(name) endpointRefs.put(data.endpoint, data.ref) - receivers.put(data) + receivers.put(data) // for the OnStart message } endpointRef } @@ -73,7 +77,7 @@ private[netty] class Dispatcher(nettyEnv: NettyRpcEnv) extends Logging { val data = endpoints.remove(name) if (data != null) { data.inbox.stop() - receivers.put(data) + receivers.put(data) // for the OnStop message } // Don't clean `endpointRefs` here because it's possible that some messages are being processed // now and they can use `getRpcEndpointRef`. So `endpointRefs` will be cleaned in Inbox via @@ -91,19 +95,23 @@ private[netty] class Dispatcher(nettyEnv: NettyRpcEnv) extends Logging { } /** - * Send a message to all registered [[RpcEndpoint]]s. - * @param message + * Send a message to all registered [[RpcEndpoint]]s in this process. + * + * This can be used to make network events known to all end points (e.g. "a new node connected"). */ - def broadcastMessage(message: InboxMessage): Unit = { + def postToAll(message: InboxMessage): Unit = { val iter = endpoints.keySet().iterator() while (iter.hasNext) { val name = iter.next - postMessageToInbox(name, (_) => message, - () => { logWarning(s"Drop ${message} because ${name} has been stopped") }) + postMessage( + name, + _ => message, + () => { logWarning(s"Drop $message because $name has been stopped") }) } } - def postMessage(message: RequestMessage, callback: RpcResponseCallback): Unit = { + /** Posts a message sent by a remote endpoint. */ + def postRemoteMessage(message: RequestMessage, callback: RpcResponseCallback): Unit = { def createMessage(sender: NettyRpcEndpointRef): InboxMessage = { val rpcCallContext = new RemoteNettyRpcCallContext( @@ -116,10 +124,11 @@ private[netty] class Dispatcher(nettyEnv: NettyRpcEnv) extends Logging { new SparkException(s"Could not find ${message.receiver.name} or it has been stopped")) } - postMessageToInbox(message.receiver.name, createMessage, onEndpointStopped) + postMessage(message.receiver.name, createMessage, onEndpointStopped) } - def postMessage(message: RequestMessage, p: Promise[Any]): Unit = { + /** Posts a message sent by a local endpoint. */ + def postLocalMessage(message: RequestMessage, p: Promise[Any]): Unit = { def createMessage(sender: NettyRpcEndpointRef): InboxMessage = { val rpcCallContext = new LocalNettyRpcCallContext(sender, message.senderAddress, message.needReply, p) @@ -131,39 +140,36 @@ private[netty] class Dispatcher(nettyEnv: NettyRpcEnv) extends Logging { new SparkException(s"Could not find ${message.receiver.name} or it has been stopped")) } - postMessageToInbox(message.receiver.name, createMessage, onEndpointStopped) + postMessage(message.receiver.name, createMessage, onEndpointStopped) } - private def postMessageToInbox( + /** + * Posts a message to a specific endpoint. + * + * @param endpointName name of the endpoint. + * @param createMessageFn function to create the message. + * @param callbackIfStopped callback function if the endpoint is stopped. 
+ */ + private def postMessage( endpointName: String, createMessageFn: NettyRpcEndpointRef => InboxMessage, - onStopped: () => Unit): Unit = { - val shouldCallOnStop = - synchronized { - val data = endpoints.get(endpointName) - if (stopped || data == null) { - true - } else { - data.inbox.post(createMessageFn(data.ref)) - receivers.put(data) - false - } + callbackIfStopped: () => Unit): Unit = { + val shouldCallOnStop = synchronized { + val data = endpoints.get(endpointName) + if (stopped || data == null) { + true + } else { + data.inbox.post(createMessageFn(data.ref)) + receivers.put(data) + false } + } if (shouldCallOnStop) { // We don't need to call `onStop` in the `synchronized` block - onStopped() + callbackIfStopped() } } - private val parallelism = nettyEnv.conf.getInt("spark.rpc.netty.dispatcher.parallelism", - Runtime.getRuntime.availableProcessors()) - - private val executor = ThreadUtils.newDaemonFixedThreadPool(parallelism, "dispatcher-event-loop") - - (0 until parallelism) foreach { _ => - executor.execute(new MessageLoop) - } - def stop(): Unit = { synchronized { if (stopped) { @@ -174,12 +180,12 @@ private[netty] class Dispatcher(nettyEnv: NettyRpcEnv) extends Logging { // Stop all endpoints. This will queue all endpoints for processing by the message loops. endpoints.keySet().asScala.foreach(unregisterRpcEndpoint) // Enqueue a message that tells the message loops to stop. - receivers.put(PoisonEndpoint) - executor.shutdown() + receivers.put(PoisonPill) + threadpool.shutdown() } def awaitTermination(): Unit = { - executor.awaitTermination(Long.MaxValue, TimeUnit.MILLISECONDS) + threadpool.awaitTermination(Long.MaxValue, TimeUnit.MILLISECONDS) } /** @@ -189,15 +195,27 @@ private[netty] class Dispatcher(nettyEnv: NettyRpcEnv) extends Logging { endpoints.containsKey(name) } + /** Thread pool used for dispatching messages. */ + private val threadpool: ThreadPoolExecutor = { + val numThreads = nettyEnv.conf.getInt("spark.rpc.netty.dispatcher.numThreads", + Runtime.getRuntime.availableProcessors()) + val pool = ThreadUtils.newDaemonFixedThreadPool(numThreads, "dispatcher-event-loop") + for (i <- 0 until numThreads) { + pool.execute(new MessageLoop) + } + pool + } + + /** Message loop used for dispatching messages. */ private class MessageLoop extends Runnable { override def run(): Unit = { try { while (true) { try { val data = receivers.take() - if (data == PoisonEndpoint) { - // Put PoisonEndpoint back so that other MessageLoops can see it. - receivers.put(PoisonEndpoint) + if (data == PoisonPill) { + // Put PoisonPill back so that other MessageLoops can see it. + receivers.put(PoisonPill) return } data.inbox.process(Dispatcher.this) @@ -211,8 +229,6 @@ private[netty] class Dispatcher(nettyEnv: NettyRpcEnv) extends Logging { } } - /** - * A poison endpoint that indicates MessageLoop should exit its loop. - */ - private val PoisonEndpoint = new EndpointData(null, null, null) + /** A poison endpoint that indicates MessageLoop should exit its message loop. 
*/ + private val PoisonPill = new EndpointData(null, null, null) } diff --git a/core/src/main/scala/org/apache/spark/rpc/netty/IDVerifier.scala b/core/src/main/scala/org/apache/spark/rpc/netty/IDVerifier.scala index 6061c9b8de944..fa9a3eb99b02a 100644 --- a/core/src/main/scala/org/apache/spark/rpc/netty/IDVerifier.scala +++ b/core/src/main/scala/org/apache/spark/rpc/netty/IDVerifier.scala @@ -26,8 +26,8 @@ private[netty] case class ID(name: String) /** * An [[RpcEndpoint]] for remote [[RpcEnv]]s to query if a [[RpcEndpoint]] exists in this [[RpcEnv]] */ -private[netty] class IDVerifier( - override val rpcEnv: RpcEnv, dispatcher: Dispatcher) extends RpcEndpoint { +private[netty] class IDVerifier(override val rpcEnv: RpcEnv, dispatcher: Dispatcher) + extends RpcEndpoint { override def receiveAndReply(context: RpcCallContext): PartialFunction[Any, Unit] = { case ID(name) => context.reply(dispatcher.verify(name)) diff --git a/core/src/main/scala/org/apache/spark/rpc/netty/Inbox.scala b/core/src/main/scala/org/apache/spark/rpc/netty/Inbox.scala index b669f59a2884e..c72b588db57fe 100644 --- a/core/src/main/scala/org/apache/spark/rpc/netty/Inbox.scala +++ b/core/src/main/scala/org/apache/spark/rpc/netty/Inbox.scala @@ -17,14 +17,16 @@ package org.apache.spark.rpc.netty -import java.util.LinkedList import javax.annotation.concurrent.GuardedBy import scala.util.control.NonFatal +import com.google.common.annotations.VisibleForTesting + import org.apache.spark.{Logging, SparkException} import org.apache.spark.rpc.{RpcAddress, RpcEndpoint, ThreadSafeRpcEndpoint} + private[netty] sealed trait InboxMessage private[netty] case class ContentMessage( @@ -37,44 +39,40 @@ private[netty] case object OnStart extends InboxMessage private[netty] case object OnStop extends InboxMessage -/** - * A broadcast message that indicates connecting to a remote node. - */ -private[netty] case class Associated(remoteAddress: RpcAddress) extends InboxMessage +/** A message to tell all endpoints that a remote process has connected. */ +private[netty] case class RemoteProcessConnected(remoteAddress: RpcAddress) extends InboxMessage -/** - * A broadcast message that indicates a remote connection is lost. - */ -private[netty] case class Disassociated(remoteAddress: RpcAddress) extends InboxMessage +/** A message to tell all endpoints that a remote process has disconnected. */ +private[netty] case class RemoteProcessDisconnected(remoteAddress: RpcAddress) extends InboxMessage -/** - * A broadcast message that indicates a network error - */ -private[netty] case class AssociationError(cause: Throwable, remoteAddress: RpcAddress) +/** A message to tell all endpoints that a network error has happened. */ +private[netty] case class RemoteProcessConnectionError(cause: Throwable, remoteAddress: RpcAddress) extends InboxMessage /** * A inbox that stores messages for an [[RpcEndpoint]] and posts messages to it thread-safely. - * @param endpointRef - * @param endpoint */ private[netty] class Inbox( val endpointRef: NettyRpcEndpointRef, - val endpoint: RpcEndpoint) extends Logging { + val endpoint: RpcEndpoint) + extends Logging { - inbox => + inbox => // Give this an alias so we can use it more clearly in closures. @GuardedBy("this") - protected val messages = new LinkedList[InboxMessage]() + protected val messages = new java.util.LinkedList[InboxMessage]() + /** True if the inbox (and its associated endpoint) is stopped. */ @GuardedBy("this") private var stopped = false + /** Allow multiple threads to process messages at the same time. 
*/ @GuardedBy("this") private var enableConcurrent = false + /** The number of threads processing messages for this inbox. */ @GuardedBy("this") - private var workerCount = 0 + private var numActiveThreads = 0 // OnStart should be the first message to process inbox.synchronized { @@ -87,12 +85,12 @@ private[netty] class Inbox( def process(dispatcher: Dispatcher): Unit = { var message: InboxMessage = null inbox.synchronized { - if (!enableConcurrent && workerCount != 0) { + if (!enableConcurrent && numActiveThreads != 0) { return } message = messages.poll() if (message != null) { - workerCount += 1 + numActiveThreads += 1 } else { return } @@ -101,15 +99,11 @@ private[netty] class Inbox( safelyCall(endpoint) { message match { case ContentMessage(_sender, content, needReply, context) => - val pf: PartialFunction[Any, Unit] = - if (needReply) { - endpoint.receiveAndReply(context) - } else { - endpoint.receive - } + // The partial function to call + val pf = if (needReply) endpoint.receiveAndReply(context) else endpoint.receive try { pf.applyOrElse[Any, Unit](content, { msg => - throw new SparkException(s"Unmatched message $message from ${_sender}") + throw new SparkException(s"Unsupported message $message from ${_sender}") }) if (!needReply) { context.finish() @@ -121,11 +115,13 @@ private[netty] class Inbox( context.sendFailure(e) } else { context.finish() - throw e } + // Throw the exception -- this exception will be caught by the safelyCall function. + // The endpoint's onError function will be called. + throw e } - case OnStart => { + case OnStart => endpoint.onStart() if (!endpoint.isInstanceOf[ThreadSafeRpcEndpoint]) { inbox.synchronized { @@ -134,24 +130,22 @@ private[netty] class Inbox( } } } - } case OnStop => - val _workCount = inbox.synchronized { - workerCount - } - assert(_workCount == 1, s"There should be only one worker but was ${_workCount}") + val activeThreads = inbox.synchronized { inbox.numActiveThreads } + assert(activeThreads == 1, + s"There should be only a single active thread but found $activeThreads threads.") dispatcher.removeRpcEndpointRef(endpoint) endpoint.onStop() assert(isEmpty, "OnStop should be the last message") - case Associated(remoteAddress) => + case RemoteProcessConnected(remoteAddress) => endpoint.onConnected(remoteAddress) - case Disassociated(remoteAddress) => + case RemoteProcessDisconnected(remoteAddress) => endpoint.onDisconnected(remoteAddress) - case AssociationError(cause, remoteAddress) => + case RemoteProcessConnectionError(cause, remoteAddress) => endpoint.onNetworkError(cause, remoteAddress) } } @@ -159,33 +153,27 @@ private[netty] class Inbox( inbox.synchronized { // "enableConcurrent" will be set to false after `onStop` is called, so we should check it // every time. 
- if (!enableConcurrent && workerCount != 1) { + if (!enableConcurrent && numActiveThreads != 1) { // If we are not the only one worker, exit - workerCount -= 1 + numActiveThreads -= 1 return } message = messages.poll() if (message == null) { - workerCount -= 1 + numActiveThreads -= 1 return } } } } - def post(message: InboxMessage): Unit = { - val dropped = - inbox.synchronized { - if (stopped) { - // We already put "OnStop" into "messages", so we should drop further messages - true - } else { - messages.add(message) - false - } - } - if (dropped) { + def post(message: InboxMessage): Unit = inbox.synchronized { + if (stopped) { + // We already put "OnStop" into "messages", so we should drop further messages onDrop(message) + } else { + messages.add(message) + false } } @@ -203,24 +191,23 @@ private[netty] class Inbox( } } - // Visible for testing. + def isEmpty: Boolean = inbox.synchronized { messages.isEmpty } + + /** Called when we are dropping a message. Test cases override this to test message dropping. */ + @VisibleForTesting protected def onDrop(message: InboxMessage): Unit = { - logWarning(s"Drop ${message} because $endpointRef is stopped") + logWarning(s"Drop $message because $endpointRef is stopped") } - def isEmpty: Boolean = inbox.synchronized { messages.isEmpty } - + /** + * Calls action closure, and calls the endpoint's onError function in the case of exceptions. + */ private def safelyCall(endpoint: RpcEndpoint)(action: => Unit): Unit = { - try { - action - } catch { - case NonFatal(e) => { - try { - endpoint.onError(e) - } catch { - case NonFatal(e) => logWarning(s"Ignore error", e) + try action catch { + case NonFatal(e) => + try endpoint.onError(e) catch { + case NonFatal(ee) => logError(s"Ignoring error", ee) } - } } } diff --git a/core/src/main/scala/org/apache/spark/rpc/netty/NettyRpcCallContext.scala b/core/src/main/scala/org/apache/spark/rpc/netty/NettyRpcCallContext.scala index 75dcc02a0c5a9..21d5bb4923d1b 100644 --- a/core/src/main/scala/org/apache/spark/rpc/netty/NettyRpcCallContext.scala +++ b/core/src/main/scala/org/apache/spark/rpc/netty/NettyRpcCallContext.scala @@ -26,7 +26,8 @@ import org.apache.spark.rpc.{RpcAddress, RpcCallContext} private[netty] abstract class NettyRpcCallContext( endpointRef: NettyRpcEndpointRef, override val senderAddress: RpcAddress, - needReply: Boolean) extends RpcCallContext with Logging { + needReply: Boolean) + extends RpcCallContext with Logging { protected def send(message: Any): Unit @@ -35,7 +36,7 @@ private[netty] abstract class NettyRpcCallContext( send(AskResponse(endpointRef, response)) } else { throw new IllegalStateException( - s"Cannot send $response to the sender because the sender won't handle it") + s"Cannot send $response to the sender because the sender does not expect a reply") } } @@ -63,7 +64,8 @@ private[netty] class LocalNettyRpcCallContext( endpointRef: NettyRpcEndpointRef, senderAddress: RpcAddress, needReply: Boolean, - p: Promise[Any]) extends NettyRpcCallContext(endpointRef, senderAddress, needReply) { + p: Promise[Any]) + extends NettyRpcCallContext(endpointRef, senderAddress, needReply) { override protected def send(message: Any): Unit = { p.success(message) @@ -78,7 +80,8 @@ private[netty] class RemoteNettyRpcCallContext( endpointRef: NettyRpcEndpointRef, callback: RpcResponseCallback, senderAddress: RpcAddress, - needReply: Boolean) extends NettyRpcCallContext(endpointRef, senderAddress, needReply) { + needReply: Boolean) + extends NettyRpcCallContext(endpointRef, senderAddress, needReply) { override 
protected def send(message: Any): Unit = { val reply = nettyEnv.serialize(message) diff --git a/core/src/main/scala/org/apache/spark/rpc/netty/NettyRpcEnv.scala b/core/src/main/scala/org/apache/spark/rpc/netty/NettyRpcEnv.scala index 5522b40782d9e..89b6df76c2707 100644 --- a/core/src/main/scala/org/apache/spark/rpc/netty/NettyRpcEnv.scala +++ b/core/src/main/scala/org/apache/spark/rpc/netty/NettyRpcEnv.scala @@ -19,7 +19,6 @@ package org.apache.spark.rpc.netty import java.io._ import java.net.{InetSocketAddress, URI} import java.nio.ByteBuffer -import java.util.Arrays import java.util.concurrent._ import javax.annotation.concurrent.GuardedBy @@ -77,19 +76,19 @@ private[netty] class NettyRpcEnv( @volatile private var server: TransportServer = _ def start(port: Int): Unit = { - val bootstraps: Seq[TransportServerBootstrap] = + val bootstraps: java.util.List[TransportServerBootstrap] = if (securityManager.isAuthenticationEnabled()) { - Seq(new SaslServerBootstrap(transportConf, securityManager)) + java.util.Arrays.asList(new SaslServerBootstrap(transportConf, securityManager)) } else { - Nil + java.util.Collections.emptyList() } - server = transportContext.createServer(port, bootstraps.asJava) + server = transportContext.createServer(port, bootstraps) dispatcher.registerRpcEndpoint(IDVerifier.NAME, new IDVerifier(this, dispatcher)) } override lazy val address: RpcAddress = { require(server != null, "NettyRpcEnv has not yet started") - RpcAddress(host, server.getPort()) + RpcAddress(host, server.getPort) } override def setupEndpoint(name: String, endpoint: RpcEndpoint): RpcEndpointRef = { @@ -119,7 +118,7 @@ private[netty] class NettyRpcEnv( val remoteAddr = message.receiver.address if (remoteAddr == address) { val promise = Promise[Any]() - dispatcher.postMessage(message, promise) + dispatcher.postLocalMessage(message, promise) promise.future.onComplete { case Success(response) => val ack = response.asInstanceOf[Ack] @@ -148,10 +147,9 @@ private[netty] class NettyRpcEnv( } }) } catch { - case e: RejectedExecutionException => { + case e: RejectedExecutionException => // `send` after shutting clientConnectionExecutor down, ignore it - logWarning(s"Cannot send ${message} because RpcEnv is stopped") - } + logWarning(s"Cannot send $message because RpcEnv is stopped") } } } @@ -161,7 +159,7 @@ private[netty] class NettyRpcEnv( val remoteAddr = message.receiver.address if (remoteAddr == address) { val p = Promise[Any]() - dispatcher.postMessage(message, p) + dispatcher.postLocalMessage(message, p) p.future.onComplete { case Success(response) => val reply = response.asInstanceOf[AskResponse] @@ -218,7 +216,7 @@ private[netty] class NettyRpcEnv( private[netty] def serialize(content: Any): Array[Byte] = { val buffer = javaSerializerInstance.serialize(content) - Arrays.copyOfRange( + java.util.Arrays.copyOfRange( buffer.array(), buffer.arrayOffset + buffer.position, buffer.arrayOffset + buffer.limit) } @@ -425,7 +423,7 @@ private[netty] class NettyRpcHandler( assert(addr != null) val remoteEnvAddress = requestMessage.senderAddress val clientAddr = RpcAddress(addr.getHostName, addr.getPort) - val broadcastMessage = + val broadcastMessage: Option[RemoteProcessConnected] = synchronized { // If the first connection to a remote RpcEnv is found, we should broadcast "Associated" if (remoteAddresses.put(clientAddr, remoteEnvAddress).isEmpty) { @@ -435,7 +433,7 @@ private[netty] class NettyRpcHandler( remoteConnectionCount.put(remoteEnvAddress, count + 1) if (count == 0) { // This is the first connection, so 
fire "Associated" - Some(Associated(remoteEnvAddress)) + Some(RemoteProcessConnected(remoteEnvAddress)) } else { None } @@ -443,8 +441,8 @@ private[netty] class NettyRpcHandler( None } } - broadcastMessage.foreach(dispatcher.broadcastMessage) - dispatcher.postMessage(requestMessage, callback) + broadcastMessage.foreach(dispatcher.postToAll) + dispatcher.postRemoteMessage(requestMessage, callback) } override def getStreamManager: StreamManager = new OneForOneStreamManager @@ -455,12 +453,12 @@ private[netty] class NettyRpcHandler( val clientAddr = RpcAddress(addr.getHostName, addr.getPort) val broadcastMessage = synchronized { - remoteAddresses.get(clientAddr).map(AssociationError(cause, _)) + remoteAddresses.get(clientAddr).map(RemoteProcessConnectionError(cause, _)) } if (broadcastMessage.isEmpty) { logError(cause.getMessage, cause) } else { - dispatcher.broadcastMessage(broadcastMessage.get) + dispatcher.postToAll(broadcastMessage.get) } } else { // If the channel is closed before connecting, its remoteAddress will be null. @@ -485,7 +483,7 @@ private[netty] class NettyRpcHandler( if (count - 1 == 0) { // We lost all clients, so clean up and fire "Disassociated" remoteConnectionCount.remove(remoteEnvAddress) - Some(Disassociated(remoteEnvAddress)) + Some(RemoteProcessDisconnected(remoteEnvAddress)) } else { // Decrease the connection number of remoteEnvAddress remoteConnectionCount.put(remoteEnvAddress, count - 1) @@ -493,7 +491,7 @@ private[netty] class NettyRpcHandler( } } } - broadcastMessage.foreach(dispatcher.broadcastMessage) + broadcastMessage.foreach(dispatcher.postToAll) } else { // If the channel is closed before connecting, its remoteAddress will be null. In this case, // we can ignore it since we don't fire "Associated". diff --git a/core/src/main/scala/org/apache/spark/util/ThreadUtils.scala b/core/src/main/scala/org/apache/spark/util/ThreadUtils.scala index 1ed098379e299..15e7519d708c6 100644 --- a/core/src/main/scala/org/apache/spark/util/ThreadUtils.scala +++ b/core/src/main/scala/org/apache/spark/util/ThreadUtils.scala @@ -15,7 +15,6 @@ * limitations under the License. */ - package org.apache.spark.util import java.util.concurrent._ diff --git a/core/src/main/scala/org/apache/spark/util/Utils.scala b/core/src/main/scala/org/apache/spark/util/Utils.scala index e60c1b355a73e..bd7e51c3b5100 100644 --- a/core/src/main/scala/org/apache/spark/util/Utils.scala +++ b/core/src/main/scala/org/apache/spark/util/Utils.scala @@ -1895,6 +1895,7 @@ private[spark] object Utils extends Logging { * This is expected to throw java.net.BindException on port collision. * @param conf A SparkConf used to get the maximum number of retries when binding to a port. * @param serviceName Name of the service. 
+ * @return (service: T, port: Int) */ def startServiceOnPort[T]( startPort: Int, diff --git a/core/src/test/scala/org/apache/spark/rpc/netty/InboxSuite.scala b/core/src/test/scala/org/apache/spark/rpc/netty/InboxSuite.scala index 120cf1b6fa9dc..276c077b3d13e 100644 --- a/core/src/test/scala/org/apache/spark/rpc/netty/InboxSuite.scala +++ b/core/src/test/scala/org/apache/spark/rpc/netty/InboxSuite.scala @@ -113,7 +113,7 @@ class InboxSuite extends SparkFunSuite { val remoteAddress = RpcAddress("localhost", 11111) val inbox = new Inbox(endpointRef, endpoint) - inbox.post(Associated(remoteAddress)) + inbox.post(RemoteProcessConnected(remoteAddress)) inbox.process(dispatcher) endpoint.verifySingleOnConnectedMessage(remoteAddress) @@ -127,7 +127,7 @@ class InboxSuite extends SparkFunSuite { val remoteAddress = RpcAddress("localhost", 11111) val inbox = new Inbox(endpointRef, endpoint) - inbox.post(Disassociated(remoteAddress)) + inbox.post(RemoteProcessDisconnected(remoteAddress)) inbox.process(dispatcher) endpoint.verifySingleOnDisconnectedMessage(remoteAddress) @@ -142,7 +142,7 @@ class InboxSuite extends SparkFunSuite { val cause = new RuntimeException("Oops") val inbox = new Inbox(endpointRef, endpoint) - inbox.post(AssociationError(cause, remoteAddress)) + inbox.post(RemoteProcessConnectionError(cause, remoteAddress)) inbox.process(dispatcher) endpoint.verifySingleOnNetworkErrorMessage(cause, remoteAddress) diff --git a/core/src/test/scala/org/apache/spark/rpc/netty/NettyRpcHandlerSuite.scala b/core/src/test/scala/org/apache/spark/rpc/netty/NettyRpcHandlerSuite.scala index 06ca035d199e8..f24f78b8c4542 100644 --- a/core/src/test/scala/org/apache/spark/rpc/netty/NettyRpcHandlerSuite.scala +++ b/core/src/test/scala/org/apache/spark/rpc/netty/NettyRpcHandlerSuite.scala @@ -45,7 +45,7 @@ class NettyRpcHandlerSuite extends SparkFunSuite { when(channel.remoteAddress()).thenReturn(new InetSocketAddress("localhost", 40001)) nettyRpcHandler.receive(client, null, null) - verify(dispatcher, times(1)).broadcastMessage(Associated(RpcAddress("localhost", 12345))) + verify(dispatcher, times(1)).postToAll(RemoteProcessConnected(RpcAddress("localhost", 12345))) } test("connectionTerminated") { @@ -60,8 +60,9 @@ class NettyRpcHandlerSuite extends SparkFunSuite { when(channel.remoteAddress()).thenReturn(new InetSocketAddress("localhost", 40000)) nettyRpcHandler.connectionTerminated(client) - verify(dispatcher, times(1)).broadcastMessage(Associated(RpcAddress("localhost", 12345))) - verify(dispatcher, times(1)).broadcastMessage(Disassociated(RpcAddress("localhost", 12345))) + verify(dispatcher, times(1)).postToAll(RemoteProcessConnected(RpcAddress("localhost", 12345))) + verify(dispatcher, times(1)).postToAll( + RemoteProcessDisconnected(RpcAddress("localhost", 12345))) } } From d0cc79ccd0b4500bd6b18184a723dabc164e8abd Mon Sep 17 00:00:00 2001 From: Davies Liu Date: Tue, 13 Oct 2015 09:57:53 -0700 Subject: [PATCH 033/139] [SPARK-11030] [SQL] share the SQLTab across sessions The SQLTab will be shared by multiple sessions. If we create multiple independent SQLContexts (instead of using newSession()), we will still see multiple SQLTabs in the Spark UI. Author: Davies Liu Closes #9048 from davies/sqlui.
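Before the diff, a rough sketch of the intended behavior (not part of this patch): sessions derived via newSession() share one SQLListener and SQL tab, while separately constructed SQLContexts still register their own. Only the SQLContext/newSession API comes from this change; the rest of the setup is assumed.

```
// Illustrative only; assumes a local build with this patch applied and the UI enabled.
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SQLContext

object SharedSqlTabSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(
      new SparkConf().setMaster("local[2]").setAppName("shared-sql-tab"))

    val rootCtx = new SQLContext(sc)    // root context: creates the SQLListener and one "SQL" tab
    val session = rootCtx.newSession()  // derived session: reuses the same listener and tab

    // A second root context created directly (not via newSession()) goes through
    // SQLContext.createListenerAndUI again, so the UI still shows an extra SQL tab.
    val independent = new SQLContext(sc)

    sc.stop()
  }
}
```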
--- .../org/apache/spark/sql/SQLContext.scala | 23 +++++++++++++------ .../spark/sql/execution/ui/SQLListener.scala | 10 +++----- .../spark/sql/execution/ui/SQLTab.scala | 4 +--- .../sql/execution/ui/SQLListenerSuite.scala | 8 +++---- .../apache/spark/sql/hive/HiveContext.scala | 12 +++++++--- 5 files changed, 33 insertions(+), 24 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala index 1bd291389241a..cd937257d31a8 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala @@ -65,12 +65,15 @@ import org.apache.spark.util.Utils class SQLContext private[sql]( @transient val sparkContext: SparkContext, @transient protected[sql] val cacheManager: CacheManager, + @transient private[sql] val listener: SQLListener, val isRootContext: Boolean) extends org.apache.spark.Logging with Serializable { self => - def this(sparkContext: SparkContext) = this(sparkContext, new CacheManager, true) + def this(sparkContext: SparkContext) = { + this(sparkContext, new CacheManager, SQLContext.createListenerAndUI(sparkContext), true) + } def this(sparkContext: JavaSparkContext) = this(sparkContext.sc) // If spark.sql.allowMultipleContexts is true, we will throw an exception if a user @@ -97,7 +100,7 @@ class SQLContext private[sql]( /** * Returns a SQLContext as new session, with separated SQL configurations, temporary tables, - * registered functions, but sharing the same SparkContext and CacheManager. + * registered functions, but sharing the same SparkContext, CacheManager, SQLListener and SQLTab. * * @since 1.6.0 */ @@ -105,6 +108,7 @@ class SQLContext private[sql]( new SQLContext( sparkContext = sparkContext, cacheManager = cacheManager, + listener = listener, isRootContext = false) } @@ -113,11 +117,6 @@ class SQLContext private[sql]( */ protected[sql] lazy val conf = new SQLConf - // `listener` should be only used in the driver - @transient private[sql] val listener = new SQLListener(this) - sparkContext.addSparkListener(listener) - sparkContext.ui.foreach(new SQLTab(this, _)) - /** * Set Spark SQL configuration properties. * @@ -1312,4 +1311,14 @@ object SQLContext { ): InternalRow } } + + /** + * Create a SQLListener then add it into SparkContext, and create an SQLTab if there is SparkUI. 
+ */ + private[sql] def createListenerAndUI(sc: SparkContext): SQLListener = { + val listener = new SQLListener(sc.conf) + sc.addSparkListener(listener) + sc.ui.foreach(new SQLTab(listener, _)) + listener + } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/SQLListener.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/SQLListener.scala index 5779c71f64e9e..d6472400a6a21 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/SQLListener.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/SQLListener.scala @@ -19,19 +19,15 @@ package org.apache.spark.sql.execution.ui import scala.collection.mutable -import com.google.common.annotations.VisibleForTesting - -import org.apache.spark.{JobExecutionStatus, Logging} import org.apache.spark.executor.TaskMetrics import org.apache.spark.scheduler._ -import org.apache.spark.sql.SQLContext import org.apache.spark.sql.execution.SQLExecution import org.apache.spark.sql.execution.metric.{SQLMetricParam, SQLMetricValue} +import org.apache.spark.{JobExecutionStatus, Logging, SparkConf} -private[sql] class SQLListener(sqlContext: SQLContext) extends SparkListener with Logging { +private[sql] class SQLListener(conf: SparkConf) extends SparkListener with Logging { - private val retainedExecutions = - sqlContext.sparkContext.conf.getInt("spark.sql.ui.retainedExecutions", 1000) + private val retainedExecutions = conf.getInt("spark.sql.ui.retainedExecutions", 1000) private val activeExecutions = mutable.HashMap[Long, SQLExecutionUIData]() diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/SQLTab.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/SQLTab.scala index 0b0867f67eb6e..9c27944d42fc6 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/SQLTab.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/SQLTab.scala @@ -20,14 +20,12 @@ package org.apache.spark.sql.execution.ui import java.util.concurrent.atomic.AtomicInteger import org.apache.spark.Logging -import org.apache.spark.sql.SQLContext import org.apache.spark.ui.{SparkUI, SparkUITab} -private[sql] class SQLTab(sqlContext: SQLContext, sparkUI: SparkUI) +private[sql] class SQLTab(val listener: SQLListener, sparkUI: SparkUI) extends SparkUITab(sparkUI, SQLTab.nextTabName) with Logging { val parent = sparkUI - val listener = sqlContext.listener attachPage(new AllExecutionsPage(this)) attachPage(new ExecutionPage(this)) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/ui/SQLListenerSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/ui/SQLListenerSuite.scala index 7a46c69a056b1..727cf3665a871 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/ui/SQLListenerSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/ui/SQLListenerSuite.scala @@ -74,7 +74,7 @@ class SQLListenerSuite extends SparkFunSuite with SharedSQLContext { } test("basic") { - val listener = new SQLListener(sqlContext) + val listener = new SQLListener(sqlContext.sparkContext.conf) val executionId = 0 val df = createTestDataFrame val accumulatorIds = @@ -212,7 +212,7 @@ class SQLListenerSuite extends SparkFunSuite with SharedSQLContext { } test("onExecutionEnd happens before onJobEnd(JobSucceeded)") { - val listener = new SQLListener(sqlContext) + val listener = new SQLListener(sqlContext.sparkContext.conf) val executionId = 0 val df = createTestDataFrame listener.onExecutionStart( @@ -241,7 +241,7 @@ class SQLListenerSuite extends 
SparkFunSuite with SharedSQLContext { } test("onExecutionEnd happens before multiple onJobEnd(JobSucceeded)s") { - val listener = new SQLListener(sqlContext) + val listener = new SQLListener(sqlContext.sparkContext.conf) val executionId = 0 val df = createTestDataFrame listener.onExecutionStart( @@ -281,7 +281,7 @@ class SQLListenerSuite extends SparkFunSuite with SharedSQLContext { } test("onExecutionEnd happens before onJobEnd(JobFailed)") { - val listener = new SQLListener(sqlContext) + val listener = new SQLListener(sqlContext.sparkContext.conf) val executionId = 0 val df = createTestDataFrame listener.onExecutionStart( diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala index ddeadd3eb737d..e620d7fb82af9 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala @@ -40,12 +40,13 @@ import org.apache.spark.api.java.JavaSparkContext import org.apache.spark.sql.SQLConf.SQLConfEntry import org.apache.spark.sql.SQLConf.SQLConfEntry._ import org.apache.spark.sql._ +import org.apache.spark.sql.catalyst.analysis._ import org.apache.spark.sql.catalyst.expressions.codegen.CodegenFallback import org.apache.spark.sql.catalyst.expressions.{Expression, LeafExpression} -import org.apache.spark.sql.catalyst.analysis._ import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.catalyst.{InternalRow, ParserDialect, SqlParser} import org.apache.spark.sql.execution.datasources.{DataSourceStrategy, PreInsertCastAndRename, PreWriteCheck} +import org.apache.spark.sql.execution.ui.SQLListener import org.apache.spark.sql.execution.{CacheManager, ExecutedCommand, ExtractPythonUDFs, SetCommand} import org.apache.spark.sql.hive.client._ import org.apache.spark.sql.hive.execution.{DescribeHiveTableCommand, HiveNativeCommand} @@ -88,12 +89,16 @@ private[hive] case class CurrentDatabase(ctx: HiveContext) class HiveContext private[hive]( sc: SparkContext, cacheManager: CacheManager, + @transient listener: SQLListener, @transient execHive: ClientWrapper, @transient metaHive: ClientInterface, - isRootContext: Boolean) extends SQLContext(sc, cacheManager, isRootContext) with Logging { + isRootContext: Boolean) + extends SQLContext(sc, cacheManager, listener, isRootContext) with Logging { self => - def this(sc: SparkContext) = this(sc, new CacheManager, null, null, true) + def this(sc: SparkContext) = { + this(sc, new CacheManager, SQLContext.createListenerAndUI(sc), null, null, true) + } def this(sc: JavaSparkContext) = this(sc.sc) import org.apache.spark.sql.hive.HiveContext._ @@ -109,6 +114,7 @@ class HiveContext private[hive]( new HiveContext( sc = sc, cacheManager = cacheManager, + listener = listener, execHive = executionHive.newSession(), metaHive = metadataHive.newSession(), isRootContext = false) From 5e3868ba139f5f0b3a33361c6b884594a3ab6421 Mon Sep 17 00:00:00 2001 From: Sun Rui Date: Tue, 13 Oct 2015 10:02:21 -0700 Subject: [PATCH 034/139] [SPARK-10051] [SPARKR] Support collecting data of StructType in DataFrame Two points in this PR: 1. Originally thought was that a named R list is assumed to be a struct in SerDe. But this is problematic because some R functions will implicitly generate named lists that are not intended to be a struct when transferred by SerDe. So SerDe clients have to explicitly mark a names list as struct by changing its class from "list" to "struct". 2. 
SerDe is in the Spark Core module, and data of StructType is represented as GenricRow which is defined in Spark SQL module. SerDe can't import GenricRow as in maven build Spark SQL module depends on Spark Core module. So this PR adds a registration hook in SerDe to allow SQLUtils in Spark SQL module to register its functions for serialization and deserialization of StructType. Author: Sun Rui Closes #8794 from sun-rui/SPARK-10051. --- R/pkg/R/SQLContext.R | 22 +++--- R/pkg/R/deserialize.R | 10 +++ R/pkg/R/schema.R | 28 +++++++- R/pkg/R/serialize.R | 43 +++++++---- R/pkg/R/sparkR.R | 4 +- R/pkg/R/utils.R | 17 +++++ R/pkg/inst/tests/test_sparkSQL.R | 51 +++++++------ .../scala/org/apache/spark/api/r/SerDe.scala | 71 ++++++++++++++----- .../org/apache/spark/sql/api/r/SQLUtils.scala | 47 ++++++++++-- 9 files changed, 224 insertions(+), 69 deletions(-) diff --git a/R/pkg/R/SQLContext.R b/R/pkg/R/SQLContext.R index 1c58fd96d750a..66c7e307212c3 100644 --- a/R/pkg/R/SQLContext.R +++ b/R/pkg/R/SQLContext.R @@ -32,6 +32,7 @@ infer_type <- function(x) { numeric = "double", raw = "binary", list = "array", + struct = "struct", environment = "map", Date = "date", POSIXlt = "timestamp", @@ -44,17 +45,18 @@ infer_type <- function(x) { paste0("map") } else if (type == "array") { stopifnot(length(x) > 0) + + paste0("array<", infer_type(x[[1]]), ">") + } else if (type == "struct") { + stopifnot(length(x) > 0) names <- names(x) - if (is.null(names)) { - paste0("array<", infer_type(x[[1]]), ">") - } else { - # StructType - types <- lapply(x, infer_type) - fields <- lapply(1:length(x), function(i) { - structField(names[[i]], types[[i]], TRUE) - }) - do.call(structType, fields) - } + stopifnot(!is.null(names)) + + type <- lapply(seq_along(x), function(i) { + paste0(names[[i]], ":", infer_type(x[[i]]), ",") + }) + type <- Reduce(paste0, type) + type <- paste0("struct<", substr(type, 1, nchar(type) - 1), ">") } else if (length(x) > 1) { paste0("array<", infer_type(x[[1]]), ">") } else { diff --git a/R/pkg/R/deserialize.R b/R/pkg/R/deserialize.R index ce88d0b071b72..f7e56e43016ea 100644 --- a/R/pkg/R/deserialize.R +++ b/R/pkg/R/deserialize.R @@ -51,6 +51,7 @@ readTypedObject <- function(con, type) { "a" = readArray(con), "l" = readList(con), "e" = readEnv(con), + "s" = readStruct(con), "n" = NULL, "j" = getJobj(readString(con)), stop(paste("Unsupported type for deserialization", type))) @@ -135,6 +136,15 @@ readEnv <- function(con) { env } +# Read a field of StructType from DataFrame +# into a named list in R whose class is "struct" +readStruct <- function(con) { + names <- readObject(con) + fields <- readObject(con) + names(fields) <- names + listToStruct(fields) +} + readRaw <- function(con) { dataLen <- readInt(con) readBin(con, raw(), as.integer(dataLen), endian = "big") diff --git a/R/pkg/R/schema.R b/R/pkg/R/schema.R index 8df1563f8ebc0..6f0e9a94e9bfa 100644 --- a/R/pkg/R/schema.R +++ b/R/pkg/R/schema.R @@ -136,7 +136,7 @@ checkType <- function(type) { switch (firstChar, a = { # Array type - m <- regexec("^array<(.*)>$", type) + m <- regexec("^array<(.+)>$", type) matchedStrings <- regmatches(type, m) if (length(matchedStrings[[1]]) >= 2) { elemType <- matchedStrings[[1]][2] @@ -146,7 +146,7 @@ checkType <- function(type) { }, m = { # Map type - m <- regexec("^map<(.*),(.*)>$", type) + m <- regexec("^map<(.+),(.+)>$", type) matchedStrings <- regmatches(type, m) if (length(matchedStrings[[1]]) >= 3) { keyType <- matchedStrings[[1]][2] @@ -157,6 +157,30 @@ checkType <- function(type) { checkType(valueType) 
return() } + }, + s = { + # Struct type + m <- regexec("^struct<(.+)>$", type) + matchedStrings <- regmatches(type, m) + if (length(matchedStrings[[1]]) >= 2) { + fieldsString <- matchedStrings[[1]][2] + # strsplit does not return the final empty string, so check if + # the final char is "," + if (substr(fieldsString, nchar(fieldsString), nchar(fieldsString)) != ",") { + fields <- strsplit(fieldsString, ",")[[1]] + for (field in fields) { + m <- regexec("^(.+):(.+)$", field) + matchedStrings <- regmatches(field, m) + if (length(matchedStrings[[1]]) >= 3) { + fieldType <- matchedStrings[[1]][3] + checkType(fieldType) + } else { + break + } + } + return() + } + } }) } diff --git a/R/pkg/R/serialize.R b/R/pkg/R/serialize.R index 91e6b3e5609b5..17082b4e52fcf 100644 --- a/R/pkg/R/serialize.R +++ b/R/pkg/R/serialize.R @@ -32,6 +32,21 @@ # environment -> Map[String, T], where T is a native type # jobj -> Object, where jobj is an object created in the backend +getSerdeType <- function(object) { + type <- class(object)[[1]] + if (type != "list") { + type + } else { + # Check if all elements are of same type + elemType <- unique(sapply(object, function(elem) { getSerdeType(elem) })) + if (length(elemType) <= 1) { + "array" + } else { + "list" + } + } +} + writeObject <- function(con, object, writeType = TRUE) { # NOTE: In R vectors have same type as objects. So we don't support # passing in vectors as arrays and instead require arrays to be passed @@ -45,10 +60,12 @@ writeObject <- function(con, object, writeType = TRUE) { type <- "NULL" } } + + serdeType <- getSerdeType(object) if (writeType) { - writeType(con, type) + writeType(con, serdeType) } - switch(type, + switch(serdeType, NULL = writeVoid(con), integer = writeInt(con, object), character = writeString(con, object), @@ -56,7 +73,9 @@ writeObject <- function(con, object, writeType = TRUE) { double = writeDouble(con, object), numeric = writeDouble(con, object), raw = writeRaw(con, object), + array = writeArray(con, object), list = writeList(con, object), + struct = writeList(con, object), jobj = writeJobj(con, object), environment = writeEnv(con, object), Date = writeDate(con, object), @@ -110,7 +129,7 @@ writeRowSerialize <- function(outputCon, rows) { serializeRow <- function(row) { rawObj <- rawConnection(raw(0), "wb") on.exit(close(rawObj)) - writeGenericList(rawObj, row) + writeList(rawObj, row) rawConnectionValue(rawObj) } @@ -128,7 +147,9 @@ writeType <- function(con, class) { double = "d", numeric = "d", raw = "r", + array = "a", list = "l", + struct = "s", jobj = "j", environment = "e", Date = "D", @@ -139,15 +160,13 @@ writeType <- function(con, class) { } # Used to pass arrays where all the elements are of the same type -writeList <- function(con, arr) { - # All elements should be of same type - elemType <- unique(sapply(arr, function(elem) { class(elem) })) - stopifnot(length(elemType) <= 1) - +writeArray <- function(con, arr) { # TODO: Empty lists are given type "character" right now. # This may not work if the Java side expects array of any other type. 
- if (length(elemType) == 0) { + if (length(arr) == 0) { elemType <- class("somestring") + } else { + elemType <- getSerdeType(arr[[1]]) } writeType(con, elemType) @@ -161,7 +180,7 @@ writeList <- function(con, arr) { } # Used to pass arrays where the elements can be of different types -writeGenericList <- function(con, list) { +writeList <- function(con, list) { writeInt(con, length(list)) for (elem in list) { writeObject(con, elem) @@ -174,9 +193,9 @@ writeEnv <- function(con, env) { writeInt(con, len) if (len > 0) { - writeList(con, as.list(ls(env))) + writeArray(con, as.list(ls(env))) vals <- lapply(ls(env), function(x) { env[[x]] }) - writeGenericList(con, as.list(vals)) + writeList(con, as.list(vals)) } } diff --git a/R/pkg/R/sparkR.R b/R/pkg/R/sparkR.R index 3c57a44db257d..cc47110f54732 100644 --- a/R/pkg/R/sparkR.R +++ b/R/pkg/R/sparkR.R @@ -178,7 +178,7 @@ sparkR.init <- function( } nonEmptyJars <- Filter(function(x) { x != "" }, jars) - localJarPaths <- sapply(nonEmptyJars, + localJarPaths <- lapply(nonEmptyJars, function(j) { utils::URLencode(paste("file:", uriSep, j, sep = "")) }) # Set the start time to identify jobjs @@ -193,7 +193,7 @@ sparkR.init <- function( master, appName, as.character(sparkHome), - as.list(localJarPaths), + localJarPaths, sparkEnvirMap, sparkExecutorEnvMap), envir = .sparkREnv diff --git a/R/pkg/R/utils.R b/R/pkg/R/utils.R index 69a2bc728f842..94f16c7ac52cc 100644 --- a/R/pkg/R/utils.R +++ b/R/pkg/R/utils.R @@ -588,3 +588,20 @@ mergePartitions <- function(rdd, zip) { PipelinedRDD(rdd, partitionFunc) } + +# Convert a named list to struct so that +# SerDe won't confuse between a normal named list and struct +listToStruct <- function(list) { + stopifnot(class(list) == "list") + stopifnot(!is.null(names(list))) + class(list) <- "struct" + list +} + +# Convert a struct to a named list +structToList <- function(struct) { + stopifnot(class(list) == "struct") + + class(struct) <- "list" + struct +} diff --git a/R/pkg/inst/tests/test_sparkSQL.R b/R/pkg/inst/tests/test_sparkSQL.R index 3a04edbb4c116..af6efa40fb2f6 100644 --- a/R/pkg/inst/tests/test_sparkSQL.R +++ b/R/pkg/inst/tests/test_sparkSQL.R @@ -66,10 +66,7 @@ test_that("infer types and check types", { expect_equal(infer_type(as.POSIXlt("2015-03-11 12:13:04.043")), "timestamp") expect_equal(infer_type(c(1L, 2L)), "array") expect_equal(infer_type(list(1L, 2L)), "array") - testStruct <- infer_type(list(a = 1L, b = "2")) - expect_equal(class(testStruct), "structType") - checkStructField(testStruct$fields()[[1]], "a", "IntegerType", TRUE) - checkStructField(testStruct$fields()[[2]], "b", "StringType", TRUE) + expect_equal(infer_type(listToStruct(list(a = 1L, b = "2"))), "struct") e <- new.env() assign("a", 1L, envir = e) expect_equal(infer_type(e), "map") @@ -242,38 +239,36 @@ test_that("create DataFrame with different data types", { expect_equal(collect(df), data.frame(l, stringsAsFactors = FALSE)) }) -test_that("create DataFrame with nested array and map", { -# e <- new.env() -# assign("n", 3L, envir = e) -# l <- list(1:10, list("a", "b"), e, list(a="aa", b=3L)) -# df <- createDataFrame(sqlContext, list(l), c("a", "b", "c", "d")) -# expect_equal(dtypes(df), list(c("a", "array"), c("b", "array"), -# c("c", "map"), c("d", "struct"))) -# expect_equal(count(df), 1) -# ldf <- collect(df) -# expect_equal(ldf[1,], l[[1]]) - - # ArrayType and MapType +test_that("create DataFrame with complex types", { e <- new.env() assign("n", 3L, envir = e) - l <- list(as.list(1:10), list("a", "b"), e) - df <- 
createDataFrame(sqlContext, list(l), c("a", "b", "c")) + s <- listToStruct(list(a = "aa", b = 3L)) + + l <- list(as.list(1:10), list("a", "b"), e, s) + df <- createDataFrame(sqlContext, list(l), c("a", "b", "c", "d")) expect_equal(dtypes(df), list(c("a", "array"), c("b", "array"), - c("c", "map"))) + c("c", "map"), + c("d", "struct"))) expect_equal(count(df), 1) ldf <- collect(df) - expect_equal(names(ldf), c("a", "b", "c")) + expect_equal(names(ldf), c("a", "b", "c", "d")) expect_equal(ldf[1, 1][[1]], l[[1]]) expect_equal(ldf[1, 2][[1]], l[[2]]) + e <- ldf$c[[1]] expect_equal(class(e), "environment") expect_equal(ls(e), "n") expect_equal(e$n, 3L) + + s <- ldf$d[[1]] + expect_equal(class(s), "struct") + expect_equal(s$a, "aa") + expect_equal(s$b, 3L) }) -# For test map type in DataFrame +# For test map type and struct type in DataFrame mockLinesMapType <- c("{\"name\":\"Bob\",\"info\":{\"age\":16,\"height\":176.5}}", "{\"name\":\"Alice\",\"info\":{\"age\":20,\"height\":164.3}}", "{\"name\":\"David\",\"info\":{\"age\":60,\"height\":180}}") @@ -308,7 +303,19 @@ test_that("Collect DataFrame with complex types", { expect_equal(bob$age, 16) expect_equal(bob$height, 176.5) - # TODO: tests for StructType after it is supported + # StructType + df <- jsonFile(sqlContext, mapTypeJsonPath) + expect_equal(dtypes(df), list(c("info", "struct"), + c("name", "string"))) + ldf <- collect(df) + expect_equal(nrow(ldf), 3) + expect_equal(ncol(ldf), 2) + expect_equal(names(ldf), c("info", "name")) + expect_equal(ldf$name, c("Bob", "Alice", "David")) + bob <- ldf$info[[1]] + expect_equal(class(bob), "struct") + expect_equal(bob$age, 16) + expect_equal(bob$height, 176.5) }) test_that("jsonFile() on a local file returns a DataFrame", { diff --git a/core/src/main/scala/org/apache/spark/api/r/SerDe.scala b/core/src/main/scala/org/apache/spark/api/r/SerDe.scala index 0c78613e406e1..da126bac7ad1f 100644 --- a/core/src/main/scala/org/apache/spark/api/r/SerDe.scala +++ b/core/src/main/scala/org/apache/spark/api/r/SerDe.scala @@ -27,6 +27,14 @@ import scala.collection.mutable.WrappedArray * Utility functions to serialize, deserialize objects to / from R */ private[spark] object SerDe { + type ReadObject = (DataInputStream, Char) => Object + type WriteObject = (DataOutputStream, Object) => Boolean + + var sqlSerDe: (ReadObject, WriteObject) = _ + + def registerSqlSerDe(sqlSerDe: (ReadObject, WriteObject)): Unit = { + this.sqlSerDe = sqlSerDe + } // Type mapping from R to Java // @@ -63,11 +71,22 @@ private[spark] object SerDe { case 'c' => readString(dis) case 'e' => readMap(dis) case 'r' => readBytes(dis) + case 'a' => readArray(dis) case 'l' => readList(dis) case 'D' => readDate(dis) case 't' => readTime(dis) case 'j' => JVMObjectTracker.getObject(readString(dis)) - case _ => throw new IllegalArgumentException(s"Invalid type $dataType") + case _ => + if (sqlSerDe == null || sqlSerDe._1 == null) { + throw new IllegalArgumentException (s"Invalid type $dataType") + } else { + val obj = (sqlSerDe._1)(dis, dataType) + if (obj == null) { + throw new IllegalArgumentException (s"Invalid type $dataType") + } else { + obj + } + } } } @@ -141,7 +160,8 @@ private[spark] object SerDe { (0 until len).map(_ => readString(in)).toArray } - def readList(dis: DataInputStream): Array[_] = { + // All elements of an array must be of the same type + def readArray(dis: DataInputStream): Array[_] = { val arrType = readObjectType(dis) arrType match { case 'i' => readIntArr(dis) @@ -150,26 +170,43 @@ private[spark] object SerDe { case 'b' => 
readBooleanArr(dis) case 'j' => readStringArr(dis).map(x => JVMObjectTracker.getObject(x)) case 'r' => readBytesArr(dis) - case 'l' => { + case 'a' => + val len = readInt(dis) + (0 until len).map(_ => readArray(dis)).toArray + case 'l' => val len = readInt(dis) (0 until len).map(_ => readList(dis)).toArray - } - case _ => throw new IllegalArgumentException(s"Invalid array type $arrType") + case _ => + if (sqlSerDe == null || sqlSerDe._1 == null) { + throw new IllegalArgumentException (s"Invalid array type $arrType") + } else { + val len = readInt(dis) + (0 until len).map { _ => + val obj = (sqlSerDe._1)(dis, arrType) + if (obj == null) { + throw new IllegalArgumentException (s"Invalid array type $arrType") + } else { + obj + } + }.toArray + } } } + // Each element of a list can be of different type. They are all represented + // as Object on JVM side + def readList(dis: DataInputStream): Array[Object] = { + val len = readInt(dis) + (0 until len).map(_ => readObject(dis)).toArray + } + def readMap(in: DataInputStream): java.util.Map[Object, Object] = { val len = readInt(in) if (len > 0) { - val keysType = readObjectType(in) - val keysLen = readInt(in) - val keys = (0 until keysLen).map(_ => readTypedObject(in, keysType)) - - val valuesLen = readInt(in) - val values = (0 until valuesLen).map(_ => { - val valueType = readObjectType(in) - readTypedObject(in, valueType) - }) + // Keys is an array of String + val keys = readArray(in).asInstanceOf[Array[Object]] + val values = readList(in) + keys.zip(values).toMap.asJava } else { new java.util.HashMap[Object, Object]() @@ -338,8 +375,10 @@ private[spark] object SerDe { } case _ => - writeType(dos, "jobj") - writeJObj(dos, value) + if (sqlSerDe == null || sqlSerDe._2 == null || !(sqlSerDe._2)(dos, value)) { + writeType(dos, "jobj") + writeJObj(dos, value) + } } } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/api/r/SQLUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/api/r/SQLUtils.scala index f45d119c8cfdf..b0120a8d0dc4f 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/api/r/SQLUtils.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/api/r/SQLUtils.scala @@ -22,13 +22,15 @@ import java.io.{ByteArrayInputStream, ByteArrayOutputStream, DataInputStream, Da import org.apache.spark.api.java.{JavaRDD, JavaSparkContext} import org.apache.spark.api.r.SerDe import org.apache.spark.rdd.RDD -import org.apache.spark.sql.catalyst.expressions.{Alias, Expression, NamedExpression} +import org.apache.spark.sql.catalyst.expressions.{Alias, Expression, NamedExpression, GenericRowWithSchema} import org.apache.spark.sql.types._ import org.apache.spark.sql.{Column, DataFrame, GroupedData, Row, SQLContext, SaveMode} import scala.util.matching.Regex private[r] object SQLUtils { + SerDe.registerSqlSerDe((readSqlObject, writeSqlObject)) + def createSQLContext(jsc: JavaSparkContext): SQLContext = { new SQLContext(jsc) } @@ -61,15 +63,27 @@ private[r] object SQLUtils { case "boolean" => org.apache.spark.sql.types.BooleanType case "timestamp" => org.apache.spark.sql.types.TimestampType case "date" => org.apache.spark.sql.types.DateType - case r"\Aarray<(.*)${elemType}>\Z" => { + case r"\Aarray<(.+)${elemType}>\Z" => org.apache.spark.sql.types.ArrayType(getSQLDataType(elemType)) - } - case r"\Amap<(.*)${keyType},(.*)${valueType}>\Z" => { + case r"\Amap<(.+)${keyType},(.+)${valueType}>\Z" => if (keyType != "string" && keyType != "character") { throw new IllegalArgumentException("Key type of a map must be string or character") } 
org.apache.spark.sql.types.MapType(getSQLDataType(keyType), getSQLDataType(valueType)) - } + case r"\Astruct<(.+)${fieldsStr}>\Z" => + if (fieldsStr(fieldsStr.length - 1) == ',') { + throw new IllegalArgumentException(s"Invaid type $dataType") + } + val fields = fieldsStr.split(",") + val structFields = fields.map { field => + field match { + case r"\A(.+)${fieldName}:(.+)${fieldType}\Z" => + createStructField(fieldName, fieldType, true) + + case _ => throw new IllegalArgumentException(s"Invaid type $dataType") + } + } + createStructType(structFields) case _ => throw new IllegalArgumentException(s"Invaid type $dataType") } } @@ -151,4 +165,27 @@ private[r] object SQLUtils { options: java.util.Map[String, String]): DataFrame = { sqlContext.read.format(source).schema(schema).options(options).load() } + + def readSqlObject(dis: DataInputStream, dataType: Char): Object = { + dataType match { + case 's' => + // Read StructType for DataFrame + val fields = SerDe.readList(dis).asInstanceOf[Array[Object]] + Row.fromSeq(fields) + case _ => null + } + } + + def writeSqlObject(dos: DataOutputStream, obj: Object): Boolean = { + obj match { + // Handle struct type in DataFrame + case v: GenericRowWithSchema => + dos.writeByte('s') + SerDe.writeObject(dos, v.schema.fieldNames) + SerDe.writeObject(dos, v.values) + true + case _ => + false + } + } } From 1e0aba90b9e73834af70d196f7f869b062d98d94 Mon Sep 17 00:00:00 2001 From: Narine Kokhlikyan Date: Tue, 13 Oct 2015 10:09:05 -0700 Subject: [PATCH 035/139] [SPARK-10888] [SPARKR] Added as.DataFrame as a synonym to createDataFrame as.DataFrame is more a R-style like signature. Also, I'd like to know if we could make the context, e.g. sqlContext global, so that we do not have to specify it as an argument, when we each time create a dataframe. Author: Narine Kokhlikyan Closes #8952 from NarineK/sparkrasDataFrame. --- R/pkg/NAMESPACE | 3 ++- R/pkg/R/SQLContext.R | 17 +++++++++++++---- R/pkg/inst/tests/test_sparkSQL.R | 15 +++++++++++++++ 3 files changed, 30 insertions(+), 5 deletions(-) diff --git a/R/pkg/NAMESPACE b/R/pkg/NAMESPACE index 95d949ee3e5a4..41986a5e7ab7d 100644 --- a/R/pkg/NAMESPACE +++ b/R/pkg/NAMESPACE @@ -228,7 +228,8 @@ exportMethods("agg") export("sparkRSQL.init", "sparkRHive.init") -export("cacheTable", +export("as.DataFrame", + "cacheTable", "clearCache", "createDataFrame", "createExternalTable", diff --git a/R/pkg/R/SQLContext.R b/R/pkg/R/SQLContext.R index 66c7e307212c3..399f53657a68c 100644 --- a/R/pkg/R/SQLContext.R +++ b/R/pkg/R/SQLContext.R @@ -64,21 +64,23 @@ infer_type <- function(x) { } } -#' Create a DataFrame from an RDD +#' Create a DataFrame #' -#' Converts an RDD to a DataFrame by infer the types. +#' Converts R data.frame or list into DataFrame. 
#' #' @param sqlContext A SQLContext #' @param data An RDD or list or data.frame #' @param schema a list of column names or named list (StructType), optional #' @return an DataFrame +#' @rdname createDataFrame #' @export #' @examples #'\dontrun{ #' sc <- sparkR.init() #' sqlContext <- sparkRSQL.init(sc) -#' rdd <- lapply(parallelize(sc, 1:10), function(x) list(a=x, b=as.character(x))) -#' df <- createDataFrame(sqlContext, rdd) +#' df1 <- as.DataFrame(sqlContext, iris) +#' df2 <- as.DataFrame(sqlContext, list(3,4,5,6)) +#' df3 <- createDataFrame(sqlContext, iris) #' } # TODO(davies): support sampling and infer type from NA @@ -151,6 +153,13 @@ createDataFrame <- function(sqlContext, data, schema = NULL, samplingRatio = 1.0 dataFrame(sdf) } +#' @rdname createDataFrame +#' @aliases createDataFrame +#' @export +as.DataFrame <- function(sqlContext, data, schema = NULL, samplingRatio = 1.0) { + createDataFrame(sqlContext, data, schema, samplingRatio) +} + # toDF # # Converts an RDD to a DataFrame by infer the types. diff --git a/R/pkg/inst/tests/test_sparkSQL.R b/R/pkg/inst/tests/test_sparkSQL.R index af6efa40fb2f6..b599994854670 100644 --- a/R/pkg/inst/tests/test_sparkSQL.R +++ b/R/pkg/inst/tests/test_sparkSQL.R @@ -89,17 +89,28 @@ test_that("structType and structField", { test_that("create DataFrame from RDD", { rdd <- lapply(parallelize(sc, 1:10), function(x) { list(x, as.character(x)) }) df <- createDataFrame(sqlContext, rdd, list("a", "b")) + dfAsDF <- as.DataFrame(sqlContext, rdd, list("a", "b")) expect_is(df, "DataFrame") + expect_is(dfAsDF, "DataFrame") expect_equal(count(df), 10) + expect_equal(count(dfAsDF), 10) expect_equal(nrow(df), 10) + expect_equal(nrow(dfAsDF), 10) expect_equal(ncol(df), 2) + expect_equal(ncol(dfAsDF), 2) expect_equal(dim(df), c(10, 2)) + expect_equal(dim(dfAsDF), c(10, 2)) expect_equal(columns(df), c("a", "b")) + expect_equal(columns(dfAsDF), c("a", "b")) expect_equal(dtypes(df), list(c("a", "int"), c("b", "string"))) + expect_equal(dtypes(dfAsDF), list(c("a", "int"), c("b", "string"))) df <- createDataFrame(sqlContext, rdd) + dfAsDF <- as.DataFrame(sqlContext, rdd) expect_is(df, "DataFrame") + expect_is(dfAsDF, "DataFrame") expect_equal(columns(df), c("_1", "_2")) + expect_equal(columns(dfAsDF), c("_1", "_2")) schema <- structType(structField(x = "a", type = "integer", nullable = TRUE), structField(x = "b", type = "string", nullable = TRUE)) @@ -130,9 +141,13 @@ test_that("create DataFrame from RDD", { schema <- structType(structField("name", "string"), structField("age", "integer"), structField("height", "float")) df2 <- createDataFrame(sqlContext, df.toRDD, schema) + df2AsDF <- as.DataFrame(sqlContext, df.toRDD, schema) expect_equal(columns(df2), c("name", "age", "height")) + expect_equal(columns(df2AsDF), c("name", "age", "height")) expect_equal(dtypes(df2), list(c("name", "string"), c("age", "int"), c("height", "float"))) + expect_equal(dtypes(df2AsDF), list(c("name", "string"), c("age", "int"), c("height", "float"))) expect_equal(collect(where(df2, df2$name == "Bob")), c("Bob", 16, 176.5)) + expect_equal(collect(where(df2AsDF, df2$name == "Bob")), c("Bob", 16, 176.5)) localDF <- data.frame(name=c("John", "Smith", "Sarah"), age=c(19, 23, 18), From f7f28ee7a513c262d52cf433d25fbf06df9bd1f1 Mon Sep 17 00:00:00 2001 From: Adrian Zhuang Date: Tue, 13 Oct 2015 10:21:07 -0700 Subject: [PATCH 036/139] [SPARK-10913] [SPARKR] attach() function support Bring the change code up to date. Author: Adrian Zhuang Author: adrian555 Closes #9031 from adrian555/attach2. 
--- R/pkg/NAMESPACE | 1 + R/pkg/R/DataFrame.R | 30 ++++++++++++++++++++++++++++++ R/pkg/R/generics.R | 4 ++++ R/pkg/inst/tests/test_sparkSQL.R | 20 ++++++++++++++++++++ 4 files changed, 55 insertions(+) diff --git a/R/pkg/NAMESPACE b/R/pkg/NAMESPACE index 41986a5e7ab7d..ed9cd94e03b13 100644 --- a/R/pkg/NAMESPACE +++ b/R/pkg/NAMESPACE @@ -23,6 +23,7 @@ export("setJobGroup", exportClasses("DataFrame") exportMethods("arrange", + "attach", "cache", "collect", "columns", diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index 1b9137e6c7934..e0ce056243585 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -1881,3 +1881,33 @@ setMethod("as.data.frame", } collect(x) }) + +#' The specified DataFrame is attached to the R search path. This means that +#' the DataFrame is searched by R when evaluating a variable, so columns in +#' the DataFrame can be accessed by simply giving their names. +#' +#' @rdname attach +#' @title Attach DataFrame to R search path +#' @param what (DataFrame) The DataFrame to attach +#' @param pos (integer) Specify position in search() where to attach. +#' @param name (character) Name to use for the attached DataFrame. Names +#' starting with package: are reserved for library. +#' @param warn.conflicts (logical) If TRUE, warnings are printed about conflicts +#' from attaching the database, unless that DataFrame contains an object +#' @examples +#' \dontrun{ +#' attach(irisDf) +#' summary(Sepal_Width) +#' } +#' @seealso \link{detach} +setMethod("attach", + signature(what = "DataFrame"), + function(what, pos = 2, name = deparse(substitute(what)), warn.conflicts = TRUE) { + cols <- columns(what) + stopifnot(length(cols) > 0) + newEnv <- new.env() + for (i in 1:length(cols)) { + assign(x = cols[i], value = what[, cols[i]], envir = newEnv) + } + attach(newEnv, pos = pos, name = name, warn.conflicts = warn.conflicts) + }) diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R index 8fad17026c06f..c106a0024583e 100644 --- a/R/pkg/R/generics.R +++ b/R/pkg/R/generics.R @@ -1003,3 +1003,7 @@ setGeneric("rbind", signature = "...") #' @rdname as.data.frame #' @export setGeneric("as.data.frame") + +#' @rdname attach +#' @export +setGeneric("attach") \ No newline at end of file diff --git a/R/pkg/inst/tests/test_sparkSQL.R b/R/pkg/inst/tests/test_sparkSQL.R index b599994854670..d5509e475de05 100644 --- a/R/pkg/inst/tests/test_sparkSQL.R +++ b/R/pkg/inst/tests/test_sparkSQL.R @@ -1405,6 +1405,26 @@ test_that("Method as.data.frame as a synonym for collect()", { expect_equal(as.data.frame(irisDF2), collect(irisDF2)) }) +test_that("attach() on a DataFrame", { + df <- jsonFile(sqlContext, jsonPath) + expect_error(age) + attach(df) + expect_is(age, "DataFrame") + expected_age <- data.frame(age = c(NA, 30, 19)) + expect_equal(head(age), expected_age) + stat <- summary(age) + expect_equal(collect(stat)[5, "age"], "30") + age <- age$age + 1 + expect_is(age, "Column") + rm(age) + stat2 <- summary(age) + expect_equal(collect(stat2)[5, "age"], "30") + detach("df") + stat3 <- summary(df[, "age"]) + expect_equal(collect(stat3)[5, "age"], "30") + expect_error(age) +}) + unlink(parquetPath) unlink(jsonPath) unlink(jsonPathNa) From c75f058b72d492d6de898957b3058f242d70dd8a Mon Sep 17 00:00:00 2001 From: "Joseph K. Bradley" Date: Tue, 13 Oct 2015 12:03:46 -0700 Subject: [PATCH 037/139] [PYTHON] [MINOR] List modules in PySpark tests when given bad name Output list of supported modules for python tests in error message when given bad module name. CC: davies Author: Joseph K. 
Bradley Closes #9088 from jkbradley/python-tests-modules. --- python/run-tests.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/run-tests.py b/python/run-tests.py index fd56c7ab6e0e2..152f5cc98d0fd 100755 --- a/python/run-tests.py +++ b/python/run-tests.py @@ -167,7 +167,8 @@ def main(): if module_name in python_modules: modules_to_test.append(python_modules[module_name]) else: - print("Error: unrecognized module %s" % module_name) + print("Error: unrecognized module '%s'. Supported modules: %s" % + (module_name, ", ".join(python_modules))) sys.exit(-1) LOGGER.info("Will test against the following Python executables: %s", python_execs) LOGGER.info("Will test the following Python modules: %s", [x.name for x in modules_to_test]) From 2b574f52d7bf51b1fe2a73086a3735b633e9083f Mon Sep 17 00:00:00 2001 From: Xiangrui Meng Date: Tue, 13 Oct 2015 13:24:10 -0700 Subject: [PATCH 038/139] [SPARK-7402] [ML] JSON SerDe for standard param types This PR implements the JSON SerDe for the following param types: `Boolean`, `Int`, `Long`, `Float`, `Double`, `String`, `Array[Int]`, `Array[Double]`, and `Array[String]`. The implementation of `Float`, `Double`, and `Array[Double]` are specialized to handle `NaN` and `Inf`s. This will be used in pipeline persistence. jkbradley Author: Xiangrui Meng Closes #9090 from mengxr/SPARK-7402. --- .../org/apache/spark/ml/param/params.scala | 169 ++++++++++++++++++ .../apache/spark/ml/param/ParamsSuite.scala | 114 ++++++++++++ 2 files changed, 283 insertions(+) diff --git a/mllib/src/main/scala/org/apache/spark/ml/param/params.scala b/mllib/src/main/scala/org/apache/spark/ml/param/params.scala index ec98b05e13b89..8361406f87299 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/param/params.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/param/params.scala @@ -24,6 +24,9 @@ import scala.annotation.varargs import scala.collection.mutable import scala.collection.JavaConverters._ +import org.json4s._ +import org.json4s.jackson.JsonMethods._ + import org.apache.spark.annotation.{DeveloperApi, Experimental} import org.apache.spark.ml.util.Identifiable @@ -80,6 +83,30 @@ class Param[T](val parent: String, val name: String, val doc: String, val isVali /** Creates a param pair with the given value (for Scala). */ def ->(value: T): ParamPair[T] = ParamPair(this, value) + /** Encodes a param value into JSON, which can be decoded by [[jsonDecode()]]. */ + def jsonEncode(value: T): String = { + value match { + case x: String => + compact(render(JString(x))) + case _ => + throw new NotImplementedError( + "The default jsonEncode only supports string. " + + s"${this.getClass.getName} must override jsonEncode for ${value.getClass.getName}.") + } + } + + /** Decodes a param value from JSON. */ + def jsonDecode(json: String): T = { + parse(json) match { + case JString(x) => + x.asInstanceOf[T] + case _ => + throw new NotImplementedError( + "The default jsonDecode only supports string. " + + s"${this.getClass.getName} must override jsonDecode to support its value type.") + } + } + override final def toString: String = s"${parent}__$name" override final def hashCode: Int = toString.## @@ -198,6 +225,46 @@ class DoubleParam(parent: String, name: String, doc: String, isValid: Double => /** Creates a param pair with the given value (for Java). 
*/ override def w(value: Double): ParamPair[Double] = super.w(value) + + override def jsonEncode(value: Double): String = { + compact(render(DoubleParam.jValueEncode(value))) + } + + override def jsonDecode(json: String): Double = { + DoubleParam.jValueDecode(parse(json)) + } +} + +private[param] object DoubleParam { + /** Encodes a param value into JValue. */ + def jValueEncode(value: Double): JValue = { + value match { + case _ if value.isNaN => + JString("NaN") + case Double.NegativeInfinity => + JString("-Inf") + case Double.PositiveInfinity => + JString("Inf") + case _ => + JDouble(value) + } + } + + /** Decodes a param value from JValue. */ + def jValueDecode(jValue: JValue): Double = { + jValue match { + case JString("NaN") => + Double.NaN + case JString("-Inf") => + Double.NegativeInfinity + case JString("Inf") => + Double.PositiveInfinity + case JDouble(x) => + x + case _ => + throw new IllegalArgumentException(s"Cannot decode $jValue to Double.") + } + } } /** @@ -218,6 +285,15 @@ class IntParam(parent: String, name: String, doc: String, isValid: Int => Boolea /** Creates a param pair with the given value (for Java). */ override def w(value: Int): ParamPair[Int] = super.w(value) + + override def jsonEncode(value: Int): String = { + compact(render(JInt(value))) + } + + override def jsonDecode(json: String): Int = { + implicit val formats = DefaultFormats + parse(json).extract[Int] + } } /** @@ -238,6 +314,47 @@ class FloatParam(parent: String, name: String, doc: String, isValid: Float => Bo /** Creates a param pair with the given value (for Java). */ override def w(value: Float): ParamPair[Float] = super.w(value) + + override def jsonEncode(value: Float): String = { + compact(render(FloatParam.jValueEncode(value))) + } + + override def jsonDecode(json: String): Float = { + FloatParam.jValueDecode(parse(json)) + } +} + +private object FloatParam { + + /** Encodes a param value into JValue. */ + def jValueEncode(value: Float): JValue = { + value match { + case _ if value.isNaN => + JString("NaN") + case Float.NegativeInfinity => + JString("-Inf") + case Float.PositiveInfinity => + JString("Inf") + case _ => + JDouble(value) + } + } + + /** Decodes a param value from JValue. */ + def jValueDecode(jValue: JValue): Float = { + jValue match { + case JString("NaN") => + Float.NaN + case JString("-Inf") => + Float.NegativeInfinity + case JString("Inf") => + Float.PositiveInfinity + case JDouble(x) => + x.toFloat + case _ => + throw new IllegalArgumentException(s"Cannot decode $jValue to Float.") + } + } } /** @@ -258,6 +375,15 @@ class LongParam(parent: String, name: String, doc: String, isValid: Long => Bool /** Creates a param pair with the given value (for Java). */ override def w(value: Long): ParamPair[Long] = super.w(value) + + override def jsonEncode(value: Long): String = { + compact(render(JInt(value))) + } + + override def jsonDecode(json: String): Long = { + implicit val formats = DefaultFormats + parse(json).extract[Long] + } } /** @@ -272,6 +398,15 @@ class BooleanParam(parent: String, name: String, doc: String) // No need for isV /** Creates a param pair with the given value (for Java). 
*/ override def w(value: Boolean): ParamPair[Boolean] = super.w(value) + + override def jsonEncode(value: Boolean): String = { + compact(render(JBool(value))) + } + + override def jsonDecode(json: String): Boolean = { + implicit val formats = DefaultFormats + parse(json).extract[Boolean] + } } /** @@ -287,6 +422,16 @@ class StringArrayParam(parent: Params, name: String, doc: String, isValid: Array /** Creates a param pair with a [[java.util.List]] of values (for Java and Python). */ def w(value: java.util.List[String]): ParamPair[Array[String]] = w(value.asScala.toArray) + + override def jsonEncode(value: Array[String]): String = { + import org.json4s.JsonDSL._ + compact(render(value.toSeq)) + } + + override def jsonDecode(json: String): Array[String] = { + implicit val formats = DefaultFormats + parse(json).extract[Seq[String]].toArray + } } /** @@ -303,6 +448,20 @@ class DoubleArrayParam(parent: Params, name: String, doc: String, isValid: Array /** Creates a param pair with a [[java.util.List]] of values (for Java and Python). */ def w(value: java.util.List[java.lang.Double]): ParamPair[Array[Double]] = w(value.asScala.map(_.asInstanceOf[Double]).toArray) + + override def jsonEncode(value: Array[Double]): String = { + import org.json4s.JsonDSL._ + compact(render(value.toSeq.map(DoubleParam.jValueEncode))) + } + + override def jsonDecode(json: String): Array[Double] = { + parse(json) match { + case JArray(values) => + values.map(DoubleParam.jValueDecode).toArray + case _ => + throw new IllegalArgumentException(s"Cannot decode $json to Array[Double].") + } + } } /** @@ -319,6 +478,16 @@ class IntArrayParam(parent: Params, name: String, doc: String, isValid: Array[In /** Creates a param pair with a [[java.util.List]] of values (for Java and Python). 
*/ def w(value: java.util.List[java.lang.Integer]): ParamPair[Array[Int]] = w(value.asScala.map(_.asInstanceOf[Int]).toArray) + + override def jsonEncode(value: Array[Int]): String = { + import org.json4s.JsonDSL._ + compact(render(value.toSeq)) + } + + override def jsonDecode(json: String): Array[Int] = { + implicit val formats = DefaultFormats + parse(json).extract[Seq[Int]].toArray + } } /** diff --git a/mllib/src/test/scala/org/apache/spark/ml/param/ParamsSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/param/ParamsSuite.scala index a2ea279f5d5e4..eeb03dba2f825 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/param/ParamsSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/param/ParamsSuite.scala @@ -21,6 +21,120 @@ import org.apache.spark.SparkFunSuite class ParamsSuite extends SparkFunSuite { + test("json encode/decode") { + val dummy = new Params { + override def copy(extra: ParamMap): Params = defaultCopy(extra) + + override val uid: String = "dummy" + } + + { // BooleanParam + val param = new BooleanParam(dummy, "name", "doc") + for (value <- Seq(true, false)) { + val json = param.jsonEncode(value) + assert(param.jsonDecode(json) === value) + } + } + + { // IntParam + val param = new IntParam(dummy, "name", "doc") + for (value <- Seq(Int.MinValue, -1, 0, 1, Int.MaxValue)) { + val json = param.jsonEncode(value) + assert(param.jsonDecode(json) === value) + } + } + + { // LongParam + val param = new LongParam(dummy, "name", "doc") + for (value <- Seq(Long.MinValue, -1L, 0L, 1L, Long.MaxValue)) { + val json = param.jsonEncode(value) + assert(param.jsonDecode(json) === value) + } + } + + { // FloatParam + val param = new FloatParam(dummy, "name", "doc") + for (value <- Seq(Float.NaN, Float.NegativeInfinity, Float.MinValue, -1.0f, -0.5f, 0.0f, + Float.MinPositiveValue, 0.5f, 1.0f, Float.MaxValue, Float.PositiveInfinity)) { + val json = param.jsonEncode(value) + val decoded = param.jsonDecode(json) + if (value.isNaN) { + assert(decoded.isNaN) + } else { + assert(decoded === value) + } + } + } + + { // DoubleParam + val param = new DoubleParam(dummy, "name", "doc") + for (value <- Seq(Double.NaN, Double.NegativeInfinity, Double.MinValue, -1.0, -0.5, 0.0, + Double.MinPositiveValue, 0.5, 1.0, Double.MaxValue, Double.PositiveInfinity)) { + val json = param.jsonEncode(value) + val decoded = param.jsonDecode(json) + if (value.isNaN) { + assert(decoded.isNaN) + } else { + assert(decoded === value) + } + } + } + + { // StringParam + val param = new Param[String](dummy, "name", "doc") + // Currently we do not support null. 
+ for (value <- Seq("", "1", "abc", "quote\"", "newline\n")) { + val json = param.jsonEncode(value) + assert(param.jsonDecode(json) === value) + } + } + + { // IntArrayParam + val param = new IntArrayParam(dummy, "name", "doc") + val values: Seq[Array[Int]] = Seq( + Array(), + Array(1), + Array(Int.MinValue, 0, Int.MaxValue)) + for (value <- values) { + val json = param.jsonEncode(value) + assert(param.jsonDecode(json) === value) + } + } + + { // DoubleArrayParam + val param = new DoubleArrayParam(dummy, "name", "doc") + val values: Seq[Array[Double]] = Seq( + Array(), + Array(1.0), + Array(Double.NaN, Double.NegativeInfinity, Double.MinValue, -1.0, 0.0, + Double.MinPositiveValue, 1.0, Double.MaxValue, Double.PositiveInfinity)) + for (value <- values) { + val json = param.jsonEncode(value) + val decoded = param.jsonDecode(json) + assert(decoded.length === value.length) + decoded.zip(value).foreach { case (actual, expected) => + if (expected.isNaN) { + assert(actual.isNaN) + } else { + assert(actual === expected) + } + } + } + } + + { // StringArrayParam + val param = new StringArrayParam(dummy, "name", "doc") + val values: Seq[Array[String]] = Seq( + Array(), + Array(""), + Array("", "1", "abc", "quote\"", "newline\n")) + for (value <- values) { + val json = param.jsonEncode(value) + assert(param.jsonDecode(json) === value) + } + } + } + test("param") { val solver = new TestParams() val uid = solver.uid From b3ffac5178795f2d8e7908b3e77e8e89f50b5f6f Mon Sep 17 00:00:00 2001 From: Andrew Or Date: Tue, 13 Oct 2015 13:49:59 -0700 Subject: [PATCH 039/139] [SPARK-10983] Unified memory manager MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch unifies the memory management of the storage and execution regions such that either side can borrow memory from each other. When memory pressure arises, storage will be evicted in favor of execution. To avoid regressions in cases where storage is crucial, we dynamically allocate a fraction of space for storage that execution cannot evict. Several configurations are introduced: - **spark.memory.fraction (default 0.75)**: ​fraction of the heap space used for execution and storage. The lower this is, the more frequently spills and cached data eviction occur. The purpose of this config is to set aside memory for internal metadata, user data structures, and imprecise size estimation in the case of sparse, unusually large records. - **spark.memory.storageFraction (default 0.5)**: size of the storage region within the space set aside by `s​park.memory.fraction`. ​Cached data may only be evicted if total storage exceeds this region. - **spark.memory.useLegacyMode (default false)**: whether to use the memory management that existed in Spark 1.5 and before. This is mainly for backward compatibility. For a detailed description of the design, see [SPARK-10000](https://issues.apache.org/jira/browse/SPARK-10000). This patch builds on top of the `MemoryManager` interface introduced in #9000. Author: Andrew Or Closes #9084 from andrewor14/unified-memory-manager. 
--- .../scala/org/apache/spark/SparkConf.scala | 23 +- .../scala/org/apache/spark/SparkEnv.scala | 11 +- .../apache/spark/memory/MemoryManager.scala | 83 +++++-- .../spark/memory/StaticMemoryManager.scala | 105 +++------ .../spark/memory/UnifiedMemoryManager.scala | 141 ++++++++++++ .../spark/shuffle/ShuffleMemoryManager.scala | 38 ++-- .../apache/spark/storage/BlockManager.scala | 4 + .../apache/spark/storage/MemoryStore.scala | 121 ++++++---- .../collection/ExternalAppendOnlyMap.scala | 10 - .../org/apache/spark/DistributedSuite.scala | 7 +- .../scala/org/apache/spark/ShuffleSuite.scala | 6 +- .../spark/memory/MemoryManagerSuite.scala | 133 +++++++++++ .../memory/StaticMemoryManagerSuite.scala | 105 ++++----- .../memory/UnifiedMemoryManagerSuite.scala | 208 ++++++++++++++++++ .../shuffle/ShuffleMemoryManagerSuite.scala | 5 +- .../shuffle/unsafe/UnsafeShuffleSuite.scala | 3 - .../ExternalAppendOnlyMapSuite.scala | 9 +- .../util/collection/ExternalSorterSuite.scala | 23 +- docs/configuration.md | 99 ++++++--- .../execution/TestShuffleMemoryManager.scala | 10 +- .../execution/UnsafeRowSerializerSuite.scala | 2 +- 21 files changed, 840 insertions(+), 306 deletions(-) create mode 100644 core/src/main/scala/org/apache/spark/memory/UnifiedMemoryManager.scala create mode 100644 core/src/test/scala/org/apache/spark/memory/MemoryManagerSuite.scala create mode 100644 core/src/test/scala/org/apache/spark/memory/UnifiedMemoryManagerSuite.scala diff --git a/core/src/main/scala/org/apache/spark/SparkConf.scala b/core/src/main/scala/org/apache/spark/SparkConf.scala index b344b5e173d67..1a0ac3d01759c 100644 --- a/core/src/main/scala/org/apache/spark/SparkConf.scala +++ b/core/src/main/scala/org/apache/spark/SparkConf.scala @@ -418,16 +418,35 @@ class SparkConf(loadDefaults: Boolean) extends Cloneable with Logging { } // Validate memory fractions - val memoryKeys = Seq( + val deprecatedMemoryKeys = Seq( "spark.storage.memoryFraction", "spark.shuffle.memoryFraction", "spark.shuffle.safetyFraction", "spark.storage.unrollFraction", "spark.storage.safetyFraction") + val memoryKeys = Seq( + "spark.memory.fraction", + "spark.memory.storageFraction") ++ + deprecatedMemoryKeys for (key <- memoryKeys) { val value = getDouble(key, 0.5) if (value > 1 || value < 0) { - throw new IllegalArgumentException("$key should be between 0 and 1 (was '$value').") + throw new IllegalArgumentException(s"$key should be between 0 and 1 (was '$value').") + } + } + + // Warn against deprecated memory fractions (unless legacy memory management mode is enabled) + val legacyMemoryManagementKey = "spark.memory.useLegacyMode" + val legacyMemoryManagement = getBoolean(legacyMemoryManagementKey, false) + if (!legacyMemoryManagement) { + val keyset = deprecatedMemoryKeys.toSet + val detected = settings.keys().asScala.filter(keyset.contains) + if (detected.nonEmpty) { + logWarning("Detected deprecated memory fraction settings: " + + detected.mkString("[", ", ", "]") + ". As of Spark 1.6, execution and storage " + + "memory management are unified. All memory fractions used in the old model are " + + "now deprecated and no longer read. 
If you wish to use the old memory management, " + + s"you may explicitly enable `$legacyMemoryManagementKey` (not recommended).") } } diff --git a/core/src/main/scala/org/apache/spark/SparkEnv.scala b/core/src/main/scala/org/apache/spark/SparkEnv.scala index df3d84a1f08e9..c32998345145a 100644 --- a/core/src/main/scala/org/apache/spark/SparkEnv.scala +++ b/core/src/main/scala/org/apache/spark/SparkEnv.scala @@ -30,7 +30,7 @@ import org.apache.spark.annotation.DeveloperApi import org.apache.spark.api.python.PythonWorkerFactory import org.apache.spark.broadcast.BroadcastManager import org.apache.spark.metrics.MetricsSystem -import org.apache.spark.memory.{MemoryManager, StaticMemoryManager} +import org.apache.spark.memory.{MemoryManager, StaticMemoryManager, UnifiedMemoryManager} import org.apache.spark.network.BlockTransferService import org.apache.spark.network.netty.NettyBlockTransferService import org.apache.spark.rpc.{RpcEndpointRef, RpcEndpoint, RpcEnv} @@ -335,7 +335,14 @@ object SparkEnv extends Logging { val shuffleMgrClass = shortShuffleMgrNames.getOrElse(shuffleMgrName.toLowerCase, shuffleMgrName) val shuffleManager = instantiateClass[ShuffleManager](shuffleMgrClass) - val memoryManager = new StaticMemoryManager(conf) + val useLegacyMemoryManager = conf.getBoolean("spark.memory.useLegacyMode", false) + val memoryManager: MemoryManager = + if (useLegacyMemoryManager) { + new StaticMemoryManager(conf) + } else { + new UnifiedMemoryManager(conf) + } + val shuffleMemoryManager = ShuffleMemoryManager.create(conf, memoryManager, numUsableCores) val blockTransferService = new NettyBlockTransferService(conf, securityManager, numUsableCores) diff --git a/core/src/main/scala/org/apache/spark/memory/MemoryManager.scala b/core/src/main/scala/org/apache/spark/memory/MemoryManager.scala index 4bf73b696920d..7168ac549106f 100644 --- a/core/src/main/scala/org/apache/spark/memory/MemoryManager.scala +++ b/core/src/main/scala/org/apache/spark/memory/MemoryManager.scala @@ -19,6 +19,7 @@ package org.apache.spark.memory import scala.collection.mutable +import org.apache.spark.Logging import org.apache.spark.storage.{BlockId, BlockStatus, MemoryStore} @@ -29,7 +30,7 @@ import org.apache.spark.storage.{BlockId, BlockStatus, MemoryStore} * sorts and aggregations, while storage memory refers to that used for caching and propagating * internal data across the cluster. There exists one of these per JVM. */ -private[spark] abstract class MemoryManager { +private[spark] abstract class MemoryManager extends Logging { // The memory store used to evict cached blocks private var _memoryStore: MemoryStore = _ @@ -40,19 +41,38 @@ private[spark] abstract class MemoryManager { _memoryStore } + // Amount of execution/storage memory in use, accesses must be synchronized on `this` + protected var _executionMemoryUsed: Long = 0 + protected var _storageMemoryUsed: Long = 0 + /** * Set the [[MemoryStore]] used by this manager to evict cached blocks. * This must be set after construction due to initialization ordering constraints. */ - def setMemoryStore(store: MemoryStore): Unit = { + final def setMemoryStore(store: MemoryStore): Unit = { _memoryStore = store } /** - * Acquire N bytes of memory for execution. + * Total available memory for execution, in bytes. + */ + def maxExecutionMemory: Long + + /** + * Total available memory for storage, in bytes. 
+ */ + def maxStorageMemory: Long + + // TODO: avoid passing evicted blocks around to simplify method signatures (SPARK-10985) + + /** + * Acquire N bytes of memory for execution, evicting cached blocks if necessary. + * Blocks evicted in the process, if any, are added to `evictedBlocks`. * @return number of bytes successfully granted (<= N). */ - def acquireExecutionMemory(numBytes: Long): Long + def acquireExecutionMemory( + numBytes: Long, + evictedBlocks: mutable.Buffer[(BlockId, BlockStatus)]): Long /** * Acquire N bytes of memory to cache the given block, evicting existing ones if necessary. @@ -66,52 +86,73 @@ private[spark] abstract class MemoryManager { /** * Acquire N bytes of memory to unroll the given block, evicting existing ones if necessary. + * + * This extra method allows subclasses to differentiate behavior between acquiring storage + * memory and acquiring unroll memory. For instance, the memory management model in Spark + * 1.5 and before places a limit on the amount of space that can be freed from unrolling. * Blocks evicted in the process, if any, are added to `evictedBlocks`. + * * @return whether all N bytes were successfully granted. */ def acquireUnrollMemory( blockId: BlockId, numBytes: Long, - evictedBlocks: mutable.Buffer[(BlockId, BlockStatus)]): Boolean + evictedBlocks: mutable.Buffer[(BlockId, BlockStatus)]): Boolean = synchronized { + acquireStorageMemory(blockId, numBytes, evictedBlocks) + } /** * Release N bytes of execution memory. */ - def releaseExecutionMemory(numBytes: Long): Unit + def releaseExecutionMemory(numBytes: Long): Unit = synchronized { + if (numBytes > _executionMemoryUsed) { + logWarning(s"Attempted to release $numBytes bytes of execution " + + s"memory when we only have ${_executionMemoryUsed} bytes") + _executionMemoryUsed = 0 + } else { + _executionMemoryUsed -= numBytes + } + } /** * Release N bytes of storage memory. */ - def releaseStorageMemory(numBytes: Long): Unit + def releaseStorageMemory(numBytes: Long): Unit = synchronized { + if (numBytes > _storageMemoryUsed) { + logWarning(s"Attempted to release $numBytes bytes of storage " + + s"memory when we only have ${_storageMemoryUsed} bytes") + _storageMemoryUsed = 0 + } else { + _storageMemoryUsed -= numBytes + } + } /** * Release all storage memory acquired. */ - def releaseStorageMemory(): Unit + def releaseAllStorageMemory(): Unit = synchronized { + _storageMemoryUsed = 0 + } /** * Release N bytes of unroll memory. */ - def releaseUnrollMemory(numBytes: Long): Unit - - /** - * Total available memory for execution, in bytes. - */ - def maxExecutionMemory: Long - - /** - * Total available memory for storage, in bytes. - */ - def maxStorageMemory: Long + def releaseUnrollMemory(numBytes: Long): Unit = synchronized { + releaseStorageMemory(numBytes) + } /** * Execution memory currently in use, in bytes. */ - def executionMemoryUsed: Long + final def executionMemoryUsed: Long = synchronized { + _executionMemoryUsed + } /** * Storage memory currently in use, in bytes. 
*/ - def storageMemoryUsed: Long + final def storageMemoryUsed: Long = synchronized { + _storageMemoryUsed + } } diff --git a/core/src/main/scala/org/apache/spark/memory/StaticMemoryManager.scala b/core/src/main/scala/org/apache/spark/memory/StaticMemoryManager.scala index 150445edb9578..fa44f3723415d 100644 --- a/core/src/main/scala/org/apache/spark/memory/StaticMemoryManager.scala +++ b/core/src/main/scala/org/apache/spark/memory/StaticMemoryManager.scala @@ -19,7 +19,7 @@ package org.apache.spark.memory import scala.collection.mutable -import org.apache.spark.{Logging, SparkConf} +import org.apache.spark.SparkConf import org.apache.spark.storage.{BlockId, BlockStatus} @@ -34,17 +34,7 @@ private[spark] class StaticMemoryManager( conf: SparkConf, override val maxExecutionMemory: Long, override val maxStorageMemory: Long) - extends MemoryManager with Logging { - - // Max number of bytes worth of blocks to evict when unrolling - private val maxMemoryToEvictForUnroll: Long = { - (maxStorageMemory * conf.getDouble("spark.storage.unrollFraction", 0.2)).toLong - } - - // Amount of execution / storage memory in use - // Accesses must be synchronized on `this` - private var _executionMemoryUsed: Long = 0 - private var _storageMemoryUsed: Long = 0 + extends MemoryManager { def this(conf: SparkConf) { this( @@ -53,11 +43,19 @@ private[spark] class StaticMemoryManager( StaticMemoryManager.getMaxStorageMemory(conf)) } + // Max number of bytes worth of blocks to evict when unrolling + private val maxMemoryToEvictForUnroll: Long = { + (maxStorageMemory * conf.getDouble("spark.storage.unrollFraction", 0.2)).toLong + } + /** * Acquire N bytes of memory for execution. * @return number of bytes successfully granted (<= N). */ - override def acquireExecutionMemory(numBytes: Long): Long = synchronized { + override def acquireExecutionMemory( + numBytes: Long, + evictedBlocks: mutable.Buffer[(BlockId, BlockStatus)]): Long = synchronized { + assert(numBytes >= 0) assert(_executionMemoryUsed <= maxExecutionMemory) val bytesToGrant = math.min(numBytes, maxExecutionMemory - _executionMemoryUsed) _executionMemoryUsed += bytesToGrant @@ -72,7 +70,7 @@ private[spark] class StaticMemoryManager( override def acquireStorageMemory( blockId: BlockId, numBytes: Long, - evictedBlocks: mutable.Buffer[(BlockId, BlockStatus)]): Boolean = { + evictedBlocks: mutable.Buffer[(BlockId, BlockStatus)]): Boolean = synchronized { acquireStorageMemory(blockId, numBytes, numBytes, evictedBlocks) } @@ -88,7 +86,7 @@ private[spark] class StaticMemoryManager( override def acquireUnrollMemory( blockId: BlockId, numBytes: Long, - evictedBlocks: mutable.Buffer[(BlockId, BlockStatus)]): Boolean = { + evictedBlocks: mutable.Buffer[(BlockId, BlockStatus)]): Boolean = synchronized { val currentUnrollMemory = memoryStore.currentUnrollMemory val maxNumBytesToFree = math.max(0, maxMemoryToEvictForUnroll - currentUnrollMemory) val numBytesToFree = math.min(numBytes, maxNumBytesToFree) @@ -108,71 +106,16 @@ private[spark] class StaticMemoryManager( blockId: BlockId, numBytesToAcquire: Long, numBytesToFree: Long, - evictedBlocks: mutable.Buffer[(BlockId, BlockStatus)]): Boolean = { - // Note: Keep this outside synchronized block to avoid potential deadlocks! 
+ evictedBlocks: mutable.Buffer[(BlockId, BlockStatus)]): Boolean = synchronized { + assert(numBytesToAcquire >= 0) + assert(numBytesToFree >= 0) memoryStore.ensureFreeSpace(blockId, numBytesToFree, evictedBlocks) - synchronized { - assert(_storageMemoryUsed <= maxStorageMemory) - val enoughMemory = _storageMemoryUsed + numBytesToAcquire <= maxStorageMemory - if (enoughMemory) { - _storageMemoryUsed += numBytesToAcquire - } - enoughMemory - } - } - - /** - * Release N bytes of execution memory. - */ - override def releaseExecutionMemory(numBytes: Long): Unit = synchronized { - if (numBytes > _executionMemoryUsed) { - logWarning(s"Attempted to release $numBytes bytes of execution " + - s"memory when we only have ${_executionMemoryUsed} bytes") - _executionMemoryUsed = 0 - } else { - _executionMemoryUsed -= numBytes - } - } - - /** - * Release N bytes of storage memory. - */ - override def releaseStorageMemory(numBytes: Long): Unit = synchronized { - if (numBytes > _storageMemoryUsed) { - logWarning(s"Attempted to release $numBytes bytes of storage " + - s"memory when we only have ${_storageMemoryUsed} bytes") - _storageMemoryUsed = 0 - } else { - _storageMemoryUsed -= numBytes + assert(_storageMemoryUsed <= maxStorageMemory) + val enoughMemory = _storageMemoryUsed + numBytesToAcquire <= maxStorageMemory + if (enoughMemory) { + _storageMemoryUsed += numBytesToAcquire } - } - - /** - * Release all storage memory acquired. - */ - override def releaseStorageMemory(): Unit = synchronized { - _storageMemoryUsed = 0 - } - - /** - * Release N bytes of unroll memory. - */ - override def releaseUnrollMemory(numBytes: Long): Unit = { - releaseStorageMemory(numBytes) - } - - /** - * Amount of execution memory currently in use, in bytes. - */ - override def executionMemoryUsed: Long = synchronized { - _executionMemoryUsed - } - - /** - * Amount of storage memory currently in use, in bytes. - */ - override def storageMemoryUsed: Long = synchronized { - _storageMemoryUsed + enoughMemory } } @@ -184,9 +127,10 @@ private[spark] object StaticMemoryManager { * Return the total amount of memory available for the storage region, in bytes. */ private def getMaxStorageMemory(conf: SparkConf): Long = { + val systemMaxMemory = conf.getLong("spark.testing.memory", Runtime.getRuntime.maxMemory) val memoryFraction = conf.getDouble("spark.storage.memoryFraction", 0.6) val safetyFraction = conf.getDouble("spark.storage.safetyFraction", 0.9) - (Runtime.getRuntime.maxMemory * memoryFraction * safetyFraction).toLong + (systemMaxMemory * memoryFraction * safetyFraction).toLong } @@ -194,9 +138,10 @@ private[spark] object StaticMemoryManager { * Return the total amount of memory available for the execution region, in bytes. 
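   * For example, with the defaults read below (`spark.shuffle.memoryFraction` = 0.2
   * and `spark.shuffle.safetyFraction` = 0.8), this works out to roughly
   * 0.2 * 0.8 = 16% of `systemMaxMemory` (the JVM max heap, or `spark.testing.memory`
   * when that is set for tests).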
*/ private def getMaxExecutionMemory(conf: SparkConf): Long = { + val systemMaxMemory = conf.getLong("spark.testing.memory", Runtime.getRuntime.maxMemory) val memoryFraction = conf.getDouble("spark.shuffle.memoryFraction", 0.2) val safetyFraction = conf.getDouble("spark.shuffle.safetyFraction", 0.8) - (Runtime.getRuntime.maxMemory * memoryFraction * safetyFraction).toLong + (systemMaxMemory * memoryFraction * safetyFraction).toLong } } diff --git a/core/src/main/scala/org/apache/spark/memory/UnifiedMemoryManager.scala b/core/src/main/scala/org/apache/spark/memory/UnifiedMemoryManager.scala new file mode 100644 index 0000000000000..5bf78d5b674b3 --- /dev/null +++ b/core/src/main/scala/org/apache/spark/memory/UnifiedMemoryManager.scala @@ -0,0 +1,141 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.memory + +import scala.collection.mutable + +import org.apache.spark.SparkConf +import org.apache.spark.storage.{BlockStatus, BlockId} + + +/** + * A [[MemoryManager]] that enforces a soft boundary between execution and storage such that + * either side can borrow memory from the other. + * + * The region shared between execution and storage is a fraction of the total heap space + * configurable through `spark.memory.fraction` (default 0.75). The position of the boundary + * within this space is further determined by `spark.memory.storageFraction` (default 0.5). + * This means the size of the storage region is 0.75 * 0.5 = 0.375 of the heap space by default. + * + * Storage can borrow as much execution memory as is free until execution reclaims its space. + * When this happens, cached blocks will be evicted from memory until sufficient borrowed + * memory is released to satisfy the execution memory request. + * + * Similarly, execution can borrow as much storage memory as is free. However, execution + * memory is *never* evicted by storage due to the complexities involved in implementing this. + * The implication is that attempts to cache blocks may fail if execution has already eaten + * up most of the storage space, in which case the new blocks will be evicted immediately + * according to their respective storage levels. + */ +private[spark] class UnifiedMemoryManager(conf: SparkConf, maxMemory: Long) extends MemoryManager { + + def this(conf: SparkConf) { + this(conf, UnifiedMemoryManager.getMaxMemory(conf)) + } + + /** + * Size of the storage region, in bytes. + * + * This region is not statically reserved; execution can borrow from it if necessary. + * Cached blocks can be evicted only if actual storage memory usage exceeds this region. 
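+   * For example, with `spark.memory.storageFraction` left at its default of 0.5 and a
+   * shared region (`maxMemory`) of 1000 bytes, this region is 500 bytes; the remaining
+   * 500 bytes belong to execution, which may also borrow from this region while it is
+   * free (these are the numbers used in `UnifiedMemoryManagerSuite`).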
+ */ + private val storageRegionSize: Long = { + (maxMemory * conf.getDouble("spark.memory.storageFraction", 0.5)).toLong + } + + /** + * Total amount of memory, in bytes, not currently occupied by either execution or storage. + */ + private def totalFreeMemory: Long = synchronized { + assert(_executionMemoryUsed <= maxMemory) + assert(_storageMemoryUsed <= maxMemory) + assert(_executionMemoryUsed + _storageMemoryUsed <= maxMemory) + maxMemory - _executionMemoryUsed - _storageMemoryUsed + } + + /** + * Total available memory for execution, in bytes. + * In this model, this is equivalent to the amount of memory not occupied by storage. + */ + override def maxExecutionMemory: Long = synchronized { + maxMemory - _storageMemoryUsed + } + + /** + * Total available memory for storage, in bytes. + * In this model, this is equivalent to the amount of memory not occupied by execution. + */ + override def maxStorageMemory: Long = synchronized { + maxMemory - _executionMemoryUsed + } + + /** + * Acquire N bytes of memory for execution, evicting cached blocks if necessary. + * + * This method evicts blocks only up to the amount of memory borrowed by storage. + * Blocks evicted in the process, if any, are added to `evictedBlocks`. + * @return number of bytes successfully granted (<= N). + */ + override def acquireExecutionMemory( + numBytes: Long, + evictedBlocks: mutable.Buffer[(BlockId, BlockStatus)]): Long = synchronized { + assert(numBytes >= 0) + val memoryBorrowedByStorage = math.max(0, _storageMemoryUsed - storageRegionSize) + // If there is not enough free memory AND storage has borrowed some execution memory, + // then evict as much memory borrowed by storage as needed to grant this request + val shouldEvictStorage = totalFreeMemory < numBytes && memoryBorrowedByStorage > 0 + if (shouldEvictStorage) { + val spaceToEnsure = math.min(numBytes, memoryBorrowedByStorage) + memoryStore.ensureFreeSpace(spaceToEnsure, evictedBlocks) + } + val bytesToGrant = math.min(numBytes, totalFreeMemory) + _executionMemoryUsed += bytesToGrant + bytesToGrant + } + + /** + * Acquire N bytes of memory to cache the given block, evicting existing ones if necessary. + * Blocks evicted in the process, if any, are added to `evictedBlocks`. + * @return whether all N bytes were successfully granted. + */ + override def acquireStorageMemory( + blockId: BlockId, + numBytes: Long, + evictedBlocks: mutable.Buffer[(BlockId, BlockStatus)]): Boolean = synchronized { + assert(numBytes >= 0) + memoryStore.ensureFreeSpace(blockId, numBytes, evictedBlocks) + val enoughMemory = totalFreeMemory >= numBytes + if (enoughMemory) { + _storageMemoryUsed += numBytes + } + enoughMemory + } + +} + +private object UnifiedMemoryManager { + + /** + * Return the total amount of memory shared between execution and storage, in bytes. 
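+   * As a rough illustration, with the default `spark.memory.fraction` of 0.75 a 4 GB
+   * executor heap yields about 3 GB of unified memory; `spark.testing.memory`, when set,
+   * replaces the JVM max heap size in this calculation (it is intended for tests).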
+ */ + private def getMaxMemory(conf: SparkConf): Long = { + val systemMaxMemory = conf.getLong("spark.testing.memory", Runtime.getRuntime.maxMemory) + val memoryFraction = conf.getDouble("spark.memory.fraction", 0.75) + (systemMaxMemory * memoryFraction).toLong + } +} diff --git a/core/src/main/scala/org/apache/spark/shuffle/ShuffleMemoryManager.scala b/core/src/main/scala/org/apache/spark/shuffle/ShuffleMemoryManager.scala index bb64bb3f35df0..aaf543ce9232a 100644 --- a/core/src/main/scala/org/apache/spark/shuffle/ShuffleMemoryManager.scala +++ b/core/src/main/scala/org/apache/spark/shuffle/ShuffleMemoryManager.scala @@ -18,11 +18,13 @@ package org.apache.spark.shuffle import scala.collection.mutable +import scala.collection.mutable.ArrayBuffer import com.google.common.annotations.VisibleForTesting import org.apache.spark._ import org.apache.spark.memory.{StaticMemoryManager, MemoryManager} +import org.apache.spark.storage.{BlockId, BlockStatus} import org.apache.spark.unsafe.array.ByteArrayMethods /** @@ -36,8 +38,8 @@ import org.apache.spark.unsafe.array.ByteArrayMethods * If there are N tasks, it ensures that each tasks can acquire at least 1 / 2N of the memory * before it has to spill, and at most 1 / N. Because N varies dynamically, we keep track of the * set of active tasks and redo the calculations of 1 / 2N and 1 / N in waiting tasks whenever - * this set changes. This is all done by synchronizing access on "this" to mutate state and using - * wait() and notifyAll() to signal changes. + * this set changes. This is all done by synchronizing access to `memoryManager` to mutate state + * and using wait() and notifyAll() to signal changes. * * Use `ShuffleMemoryManager.create()` factory method to create a new instance. * @@ -51,7 +53,6 @@ class ShuffleMemoryManager protected ( extends Logging { private val taskMemory = new mutable.HashMap[Long, Long]() // taskAttemptId -> memory bytes - private val maxMemory = memoryManager.maxExecutionMemory private def currentTaskAttemptId(): Long = { // In case this is called on the driver, return an invalid task attempt id. @@ -65,7 +66,7 @@ class ShuffleMemoryManager protected ( * total memory pool (where N is the # of active tasks) before it is forced to spill. This can * happen if the number of tasks increases but an older task had a lot of memory already. */ - def tryToAcquire(numBytes: Long): Long = synchronized { + def tryToAcquire(numBytes: Long): Long = memoryManager.synchronized { val taskAttemptId = currentTaskAttemptId() assert(numBytes > 0, "invalid number of bytes requested: " + numBytes) @@ -73,15 +74,18 @@ class ShuffleMemoryManager protected ( // of active tasks, to let other tasks ramp down their memory in calls to tryToAcquire if (!taskMemory.contains(taskAttemptId)) { taskMemory(taskAttemptId) = 0L - notifyAll() // Will later cause waiting tasks to wake up and check numTasks again + // This will later cause waiting tasks to wake up and check numTasks again + memoryManager.notifyAll() } // Keep looping until we're either sure that we don't want to grant this request (because this // task would have more than 1 / numActiveTasks of the memory) or we have enough free // memory to give it (we always let each task get at least 1 / (2 * numActiveTasks)). 
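    // For example, with 4 active tasks and 1000 bytes of execution memory, a task may
    // grow to at most 250 bytes (1 / N) and is guaranteed at least 125 bytes (1 / 2N)
    // before it can be told to spill.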
+ // TODO: simplify this to limit each task to its own slot while (true) { val numActiveTasks = taskMemory.keys.size val curMem = taskMemory(taskAttemptId) + val maxMemory = memoryManager.maxExecutionMemory val freeMemory = maxMemory - taskMemory.values.sum // How much we can grant this task; don't let it grow to more than 1 / numActiveTasks; @@ -99,7 +103,7 @@ class ShuffleMemoryManager protected ( } else { logInfo( s"TID $taskAttemptId waiting for at least 1/2N of shuffle memory pool to be free") - wait() + memoryManager.wait() } } else { return acquire(toGrant) @@ -112,15 +116,23 @@ class ShuffleMemoryManager protected ( * Acquire N bytes of execution memory from the memory manager for the current task. * @return number of bytes actually acquired (<= N). */ - private def acquire(numBytes: Long): Long = synchronized { + private def acquire(numBytes: Long): Long = memoryManager.synchronized { val taskAttemptId = currentTaskAttemptId() - val acquired = memoryManager.acquireExecutionMemory(numBytes) + val evictedBlocks = new ArrayBuffer[(BlockId, BlockStatus)] + val acquired = memoryManager.acquireExecutionMemory(numBytes, evictedBlocks) + // Register evicted blocks, if any, with the active task metrics + // TODO: just do this in `acquireExecutionMemory` (SPARK-10985) + Option(TaskContext.get()).foreach { tc => + val metrics = tc.taskMetrics() + val lastUpdatedBlocks = metrics.updatedBlocks.getOrElse(Seq[(BlockId, BlockStatus)]()) + metrics.updatedBlocks = Some(lastUpdatedBlocks ++ evictedBlocks.toSeq) + } taskMemory(taskAttemptId) += acquired acquired } /** Release numBytes bytes for the current task. */ - def release(numBytes: Long): Unit = synchronized { + def release(numBytes: Long): Unit = memoryManager.synchronized { val taskAttemptId = currentTaskAttemptId() val curMem = taskMemory.getOrElse(taskAttemptId, 0L) if (curMem < numBytes) { @@ -129,20 +141,20 @@ class ShuffleMemoryManager protected ( } taskMemory(taskAttemptId) -= numBytes memoryManager.releaseExecutionMemory(numBytes) - notifyAll() // Notify waiters who locked "this" in tryToAcquire that memory has been freed + memoryManager.notifyAll() // Notify waiters in tryToAcquire that memory has been freed } /** Release all memory for the current task and mark it as inactive (e.g. when a task ends). 
*/ - def releaseMemoryForThisTask(): Unit = synchronized { + def releaseMemoryForThisTask(): Unit = memoryManager.synchronized { val taskAttemptId = currentTaskAttemptId() taskMemory.remove(taskAttemptId).foreach { numBytes => memoryManager.releaseExecutionMemory(numBytes) } - notifyAll() // Notify waiters who locked "this" in tryToAcquire that memory has been freed + memoryManager.notifyAll() // Notify waiters in tryToAcquire that memory has been freed } /** Returns the memory consumption, in bytes, for the current task */ - def getMemoryConsumptionForThisTask(): Long = synchronized { + def getMemoryConsumptionForThisTask(): Long = memoryManager.synchronized { val taskAttemptId = currentTaskAttemptId() taskMemory.getOrElse(taskAttemptId, 0L) } diff --git a/core/src/main/scala/org/apache/spark/storage/BlockManager.scala b/core/src/main/scala/org/apache/spark/storage/BlockManager.scala index 9f5bd2abbdc5d..c374b93766225 100644 --- a/core/src/main/scala/org/apache/spark/storage/BlockManager.scala +++ b/core/src/main/scala/org/apache/spark/storage/BlockManager.scala @@ -91,6 +91,10 @@ private[spark] class BlockManager( } memoryManager.setMemoryStore(memoryStore) + // Note: depending on the memory manager, `maxStorageMemory` may actually vary over time. + // However, since we use this only for reporting and logging, what we actually want here is + // the absolute maximum value that `maxStorageMemory` can ever possibly reach. We may need + // to revisit whether reporting this value as the "max" is intuitive to the user. private val maxMemory = memoryManager.maxStorageMemory private[spark] diff --git a/core/src/main/scala/org/apache/spark/storage/MemoryStore.scala b/core/src/main/scala/org/apache/spark/storage/MemoryStore.scala index 35c57b923c43a..4dbac388e098b 100644 --- a/core/src/main/scala/org/apache/spark/storage/MemoryStore.scala +++ b/core/src/main/scala/org/apache/spark/storage/MemoryStore.scala @@ -37,15 +37,14 @@ private case class MemoryEntry(value: Any, size: Long, deserialized: Boolean) private[spark] class MemoryStore(blockManager: BlockManager, memoryManager: MemoryManager) extends BlockStore(blockManager) { + // Note: all changes to memory allocations, notably putting blocks, evicting blocks, and + // acquiring or releasing unroll memory, must be synchronized on `memoryManager`! + private val conf = blockManager.conf private val entries = new LinkedHashMap[BlockId, MemoryEntry](32, 0.75f, true) - private val maxMemory = memoryManager.maxStorageMemory - - // Ensure only one thread is putting, and if necessary, dropping blocks at any given time - private val accountingLock = new Object // A mapping from taskAttemptId to amount of memory used for unrolling a block (in bytes) - // All accesses of this map are assumed to have manually synchronized on `accountingLock` + // All accesses of this map are assumed to have manually synchronized on `memoryManager` private val unrollMemoryMap = mutable.HashMap[Long, Long]() // Same as `unrollMemoryMap`, but for pending unroll memory as defined below. // Pending unroll memory refers to the intermediate memory occupied by a task @@ -60,6 +59,9 @@ private[spark] class MemoryStore(blockManager: BlockManager, memoryManager: Memo private val unrollMemoryThreshold: Long = conf.getLong("spark.storage.unrollMemoryThreshold", 1024 * 1024) + /** Total amount of memory available for storage, in bytes. 
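+    * Unlike the previous cached `maxMemory` val, this is a def: the value reported by the
+    * memory manager can change over time (e.g. as execution memory usage grows or shrinks
+    * under the unified model).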
*/ + private def maxMemory: Long = memoryManager.maxStorageMemory + if (maxMemory < unrollMemoryThreshold) { logWarning(s"Max memory ${Utils.bytesToString(maxMemory)} is less than the initial memory " + s"threshold ${Utils.bytesToString(unrollMemoryThreshold)} needed to store a block in " + @@ -75,7 +77,9 @@ private[spark] class MemoryStore(blockManager: BlockManager, memoryManager: Memo * Amount of storage memory, in bytes, used for caching blocks. * This does not include memory used for unrolling. */ - private def blocksMemoryUsed: Long = memoryUsed - currentUnrollMemory + private def blocksMemoryUsed: Long = memoryManager.synchronized { + memoryUsed - currentUnrollMemory + } override def getSize(blockId: BlockId): Long = { entries.synchronized { @@ -208,7 +212,7 @@ private[spark] class MemoryStore(blockManager: BlockManager, memoryManager: Memo } } - override def remove(blockId: BlockId): Boolean = { + override def remove(blockId: BlockId): Boolean = memoryManager.synchronized { val entry = entries.synchronized { entries.remove(blockId) } if (entry != null) { memoryManager.releaseStorageMemory(entry.size) @@ -220,11 +224,13 @@ private[spark] class MemoryStore(blockManager: BlockManager, memoryManager: Memo } } - override def clear() { + override def clear(): Unit = memoryManager.synchronized { entries.synchronized { entries.clear() } - memoryManager.releaseStorageMemory() + unrollMemoryMap.clear() + pendingUnrollMemoryMap.clear() + memoryManager.releaseAllStorageMemory() logInfo("MemoryStore cleared") } @@ -299,22 +305,23 @@ private[spark] class MemoryStore(blockManager: BlockManager, memoryManager: Memo } } finally { - // If we return an array, the values returned will later be cached in `tryToPut`. - // In this case, we should release the memory after we cache the block there. - // Otherwise, if we return an iterator, we release the memory reserved here - // later when the task finishes. + // If we return an array, the values returned here will be cached in `tryToPut` later. + // In this case, we should release the memory only after we cache the block there. if (keepUnrolling) { val taskAttemptId = currentTaskAttemptId() - accountingLock.synchronized { - // Here, we transfer memory from unroll to pending unroll because we expect to cache this - // block in `tryToPut`. We do not release and re-acquire memory from the MemoryManager in - // order to avoid race conditions where another component steals the memory that we're - // trying to transfer. + memoryManager.synchronized { + // Since we continue to hold onto the array until we actually cache it, we cannot + // release the unroll memory yet. Instead, we transfer it to pending unroll memory + // so `tryToPut` can further transfer it to normal storage memory later. + // TODO: we can probably express this without pending unroll memory (SPARK-10907) val amountToTransferToPending = currentUnrollMemoryForThisTask - previousMemoryReserved unrollMemoryMap(taskAttemptId) -= amountToTransferToPending pendingUnrollMemoryMap(taskAttemptId) = pendingUnrollMemoryMap.getOrElse(taskAttemptId, 0L) + amountToTransferToPending } + } else { + // Otherwise, if we return an iterator, we can only release the unroll memory when + // the task finishes since we don't know when the iterator will be consumed. } } } @@ -343,7 +350,7 @@ private[spark] class MemoryStore(blockManager: BlockManager, memoryManager: Memo * `value` will be lazily created. 
If it cannot be put into MemoryStore or disk, `value` won't be * created to avoid OOM since it may be a big ByteBuffer. * - * Synchronize on `accountingLock` to ensure that all the put requests and its associated block + * Synchronize on `memoryManager` to ensure that all the put requests and its associated block * dropping is done by only on thread at a time. Otherwise while one thread is dropping * blocks to free memory for one block, another thread may use up the freed space for * another block. @@ -365,16 +372,13 @@ private[spark] class MemoryStore(blockManager: BlockManager, memoryManager: Memo * for freeing up more space for another block that needs to be put. Only then the actually * dropping of blocks (and writing to disk if necessary) can proceed in parallel. */ - accountingLock.synchronized { + memoryManager.synchronized { // Note: if we have previously unrolled this block successfully, then pending unroll // memory should be non-zero. This is the amount that we already reserved during the // unrolling process. In this case, we can just reuse this space to cache our block. - // - // Note: the StaticMemoryManager counts unroll memory as storage memory. Here, the - // synchronization on `accountingLock` guarantees that the release of unroll memory and - // acquisition of storage memory happens atomically. However, if storage memory is acquired - // outside of MemoryStore or if unroll memory is counted as execution memory, then we will - // have to revisit this assumption. See SPARK-10983 for more context. + // The synchronization on `memoryManager` here guarantees that the release and acquire + // happen atomically. This relies on the assumption that all memory acquisitions are + // synchronized on the same lock. releasePendingUnrollMemoryForThisTask() val enoughMemory = memoryManager.acquireStorageMemory(blockId, size, droppedBlocks) if (enoughMemory) { @@ -401,34 +405,62 @@ private[spark] class MemoryStore(blockManager: BlockManager, memoryManager: Memo } } + /** + * Try to free up a given amount of space by evicting existing blocks. + * + * @param space the amount of memory to free, in bytes + * @param droppedBlocks a holder for blocks evicted in the process + * @return whether the requested free space is freed. + */ + private[spark] def ensureFreeSpace( + space: Long, + droppedBlocks: mutable.Buffer[(BlockId, BlockStatus)]): Boolean = { + ensureFreeSpace(None, space, droppedBlocks) + } + + /** + * Try to free up a given amount of space to store a block by evicting existing ones. + * + * @param space the amount of memory to free, in bytes + * @param droppedBlocks a holder for blocks evicted in the process + * @return whether the requested free space is freed. + */ + private[spark] def ensureFreeSpace( + blockId: BlockId, + space: Long, + droppedBlocks: mutable.Buffer[(BlockId, BlockStatus)]): Boolean = { + ensureFreeSpace(Some(blockId), space, droppedBlocks) + } + /** * Try to free up a given amount of space to store a particular block, but can fail if * either the block is bigger than our memory or it would require replacing another block * from the same RDD (which leads to a wasteful cyclic replacement pattern for RDDs that * don't fit into memory that we want to avoid). * - * @param blockId the ID of the block we are freeing space for + * @param blockId the ID of the block we are freeing space for, if any * @param space the size of this block * @param droppedBlocks a holder for blocks evicted in the process - * @return whether there is enough free space. 
+ * @return whether the requested free space is freed. */ - private[spark] def ensureFreeSpace( - blockId: BlockId, + private def ensureFreeSpace( + blockId: Option[BlockId], space: Long, droppedBlocks: mutable.Buffer[(BlockId, BlockStatus)]): Boolean = { - accountingLock.synchronized { + memoryManager.synchronized { val freeMemory = maxMemory - memoryUsed - val rddToAdd = getRddId(blockId) + val rddToAdd = blockId.flatMap(getRddId) val selectedBlocks = new ArrayBuffer[BlockId] var selectedMemory = 0L - logInfo(s"Ensuring $space bytes of free space for block $blockId " + + logInfo(s"Ensuring $space bytes of free space " + + blockId.map { id => s"for block $id" }.getOrElse("") + s"(free: $freeMemory, max: $maxMemory)") // Fail fast if the block simply won't fit if (space > maxMemory) { - logInfo(s"Will not store $blockId as the required space " + - s"($space bytes) than our memory limit ($maxMemory bytes)") + logInfo("Will not " + blockId.map { id => s"store $id" }.getOrElse("free memory") + + s" as the required space ($space bytes) exceeds our memory limit ($maxMemory bytes)") return false } @@ -471,8 +503,10 @@ private[spark] class MemoryStore(blockManager: BlockManager, memoryManager: Memo } true } else { - logInfo(s"Will not store $blockId as it would require dropping another block " + - "from the same RDD") + blockId.foreach { id => + logInfo(s"Will not store $id as it would require dropping another block " + + "from the same RDD") + } false } } @@ -495,8 +529,7 @@ private[spark] class MemoryStore(blockManager: BlockManager, memoryManager: Memo blockId: BlockId, memory: Long, droppedBlocks: mutable.Buffer[(BlockId, BlockStatus)]): Boolean = { - accountingLock.synchronized { - // Note: all acquisitions of unroll memory must be synchronized on `accountingLock` + memoryManager.synchronized { val success = memoryManager.acquireUnrollMemory(blockId, memory, droppedBlocks) if (success) { val taskAttemptId = currentTaskAttemptId() @@ -512,7 +545,7 @@ private[spark] class MemoryStore(blockManager: BlockManager, memoryManager: Memo */ def releaseUnrollMemoryForThisTask(memory: Long = Long.MaxValue): Unit = { val taskAttemptId = currentTaskAttemptId() - accountingLock.synchronized { + memoryManager.synchronized { if (unrollMemoryMap.contains(taskAttemptId)) { val memoryToRelease = math.min(memory, unrollMemoryMap(taskAttemptId)) if (memoryToRelease > 0) { @@ -531,7 +564,7 @@ private[spark] class MemoryStore(blockManager: BlockManager, memoryManager: Memo */ def releasePendingUnrollMemoryForThisTask(memory: Long = Long.MaxValue): Unit = { val taskAttemptId = currentTaskAttemptId() - accountingLock.synchronized { + memoryManager.synchronized { if (pendingUnrollMemoryMap.contains(taskAttemptId)) { val memoryToRelease = math.min(memory, pendingUnrollMemoryMap(taskAttemptId)) if (memoryToRelease > 0) { @@ -548,21 +581,21 @@ private[spark] class MemoryStore(blockManager: BlockManager, memoryManager: Memo /** * Return the amount of memory currently occupied for unrolling blocks across all tasks. */ - def currentUnrollMemory: Long = accountingLock.synchronized { + def currentUnrollMemory: Long = memoryManager.synchronized { unrollMemoryMap.values.sum + pendingUnrollMemoryMap.values.sum } /** * Return the amount of memory currently occupied for unrolling blocks by this task. 
*/ - def currentUnrollMemoryForThisTask: Long = accountingLock.synchronized { + def currentUnrollMemoryForThisTask: Long = memoryManager.synchronized { unrollMemoryMap.getOrElse(currentTaskAttemptId(), 0L) } /** * Return the number of tasks currently unrolling blocks. */ - private def numTasksUnrolling: Int = accountingLock.synchronized { unrollMemoryMap.keys.size } + private def numTasksUnrolling: Int = memoryManager.synchronized { unrollMemoryMap.keys.size } /** * Log information about current memory usage. diff --git a/core/src/main/scala/org/apache/spark/util/collection/ExternalAppendOnlyMap.scala b/core/src/main/scala/org/apache/spark/util/collection/ExternalAppendOnlyMap.scala index 29c5732f5a8c1..6a96b5dc12684 100644 --- a/core/src/main/scala/org/apache/spark/util/collection/ExternalAppendOnlyMap.scala +++ b/core/src/main/scala/org/apache/spark/util/collection/ExternalAppendOnlyMap.scala @@ -48,16 +48,6 @@ import org.apache.spark.executor.ShuffleWriteMetrics * However, if the spill threshold is too low, we spill frequently and incur unnecessary disk * writes. This may lead to a performance regression compared to the normal case of using the * non-spilling AppendOnlyMap. - * - * Two parameters control the memory threshold: - * - * `spark.shuffle.memoryFraction` specifies the collective amount of memory used for storing - * these maps as a fraction of the executor's total memory. Since each concurrently running - * task maintains one map, the actual threshold for each map is this quantity divided by the - * number of running tasks. - * - * `spark.shuffle.safetyFraction` specifies an additional margin of safety as a fraction of - * this threshold, in case map size estimation is not sufficiently accurate. */ @DeveloperApi class ExternalAppendOnlyMap[K, V, C]( diff --git a/core/src/test/scala/org/apache/spark/DistributedSuite.scala b/core/src/test/scala/org/apache/spark/DistributedSuite.scala index 600c1403b0344..34a4bb968e732 100644 --- a/core/src/test/scala/org/apache/spark/DistributedSuite.scala +++ b/core/src/test/scala/org/apache/spark/DistributedSuite.scala @@ -213,11 +213,8 @@ class DistributedSuite extends SparkFunSuite with Matchers with LocalSparkContex } test("compute when only some partitions fit in memory") { - val conf = new SparkConf().set("spark.storage.memoryFraction", "0.01") - sc = new SparkContext(clusterUrl, "test", conf) - // data will be 4 million * 4 bytes = 16 MB in size, but our memoryFraction set the cache - // to only 5 MB (0.01 of 512 MB), so not all of it will fit in memory; we use 20 partitions - // to make sure that *some* of them do fit though + sc = new SparkContext(clusterUrl, "test", new SparkConf) + // TODO: verify that only a subset of partitions fit in memory (SPARK-11078) val data = sc.parallelize(1 to 4000000, 20).persist(StorageLevel.MEMORY_ONLY_SER) assert(data.count() === 4000000) assert(data.count() === 4000000) diff --git a/core/src/test/scala/org/apache/spark/ShuffleSuite.scala b/core/src/test/scala/org/apache/spark/ShuffleSuite.scala index d91b799ecfc08..4a0877d86f2c6 100644 --- a/core/src/test/scala/org/apache/spark/ShuffleSuite.scala +++ b/core/src/test/scala/org/apache/spark/ShuffleSuite.scala @@ -247,11 +247,13 @@ abstract class ShuffleSuite extends SparkFunSuite with Matchers with LocalSparkC .setMaster("local") .set("spark.shuffle.spill.compress", shuffleSpillCompress.toString) .set("spark.shuffle.compress", shuffleCompress.toString) - .set("spark.shuffle.memoryFraction", "0.001") resetSparkContext() sc = new SparkContext(myConf) + 
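+      // The old spark.shuffle.memoryFraction setting is no longer read (see the deprecation
+      // warning added to SparkConf), so rather than forcing spills with a tiny fraction the
+      // test now checks that the shuffle actually wrote files to disk.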
val diskBlockManager = sc.env.blockManager.diskBlockManager try { - sc.parallelize(0 until 100000).map(i => (i / 4, i)).groupByKey().collect() + assert(diskBlockManager.getAllFiles().isEmpty) + sc.parallelize(0 until 10).map(i => (i / 4, i)).groupByKey().collect() + assert(diskBlockManager.getAllFiles().nonEmpty) } catch { case e: Exception => val errMsg = s"Failed with spark.shuffle.spill.compress=$shuffleSpillCompress," + diff --git a/core/src/test/scala/org/apache/spark/memory/MemoryManagerSuite.scala b/core/src/test/scala/org/apache/spark/memory/MemoryManagerSuite.scala new file mode 100644 index 0000000000000..36e4566310715 --- /dev/null +++ b/core/src/test/scala/org/apache/spark/memory/MemoryManagerSuite.scala @@ -0,0 +1,133 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.memory + +import java.util.concurrent.atomic.AtomicLong + +import org.mockito.Matchers.{any, anyLong} +import org.mockito.Mockito.{mock, when} +import org.mockito.invocation.InvocationOnMock +import org.mockito.stubbing.Answer + +import org.apache.spark.SparkFunSuite +import org.apache.spark.storage.MemoryStore + + +/** + * Helper trait for sharing code among [[MemoryManager]] tests. + */ +private[memory] trait MemoryManagerSuite extends SparkFunSuite { + + import MemoryManagerSuite.DEFAULT_ENSURE_FREE_SPACE_CALLED + + // Note: Mockito's verify mechanism does not provide a way to reset method call counts + // without also resetting stubbed methods. Since our test code relies on the latter, + // we need to use our own variable to track invocations of `ensureFreeSpace`. + + /** + * The amount of free space requested in the last call to [[MemoryStore.ensureFreeSpace]] + * + * This set whenever [[MemoryStore.ensureFreeSpace]] is called, and cleared when the test + * code makes explicit assertions on this variable through [[assertEnsureFreeSpaceCalled]]. + */ + private val ensureFreeSpaceCalled = new AtomicLong(DEFAULT_ENSURE_FREE_SPACE_CALLED) + + /** + * Make a mocked [[MemoryStore]] whose [[MemoryStore.ensureFreeSpace]] method is stubbed. + * + * This allows our test code to release storage memory when [[MemoryStore.ensureFreeSpace]] + * is called without relying on [[org.apache.spark.storage.BlockManager]] and all of its + * dependencies. + */ + protected def makeMemoryStore(mm: MemoryManager): MemoryStore = { + val ms = mock(classOf[MemoryStore]) + when(ms.ensureFreeSpace(anyLong(), any())).thenAnswer(ensureFreeSpaceAnswer(mm, 0)) + when(ms.ensureFreeSpace(any(), anyLong(), any())).thenAnswer(ensureFreeSpaceAnswer(mm, 1)) + mm.setMemoryStore(ms) + ms + } + + /** + * Make an [[Answer]] that stubs [[MemoryStore.ensureFreeSpace]] with the right arguments. 
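+   * Here `numBytesPos` is the index of the `numBytes` argument in the stubbed call: 0 for
+   * the two-argument overload of `ensureFreeSpace` and 1 for the three-argument overload
+   * that also takes a block id, as wired up in `makeMemoryStore` above.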
+ */ + private def ensureFreeSpaceAnswer(mm: MemoryManager, numBytesPos: Int): Answer[Boolean] = { + new Answer[Boolean] { + override def answer(invocation: InvocationOnMock): Boolean = { + val args = invocation.getArguments + require(args.size > numBytesPos, s"bad test: expected >$numBytesPos arguments " + + s"in ensureFreeSpace, found ${args.size}") + require(args(numBytesPos).isInstanceOf[Long], s"bad test: expected ensureFreeSpace " + + s"argument at index $numBytesPos to be a Long: ${args.mkString(", ")}") + val numBytes = args(numBytesPos).asInstanceOf[Long] + mockEnsureFreeSpace(mm, numBytes) + } + } + } + + /** + * Simulate the part of [[MemoryStore.ensureFreeSpace]] that releases storage memory. + * + * This is a significant simplification of the real method, which actually drops existing + * blocks based on the size of each block. Instead, here we simply release as many bytes + * as needed to ensure the requested amount of free space. This allows us to set up the + * test without relying on the [[org.apache.spark.storage.BlockManager]], which brings in + * many other dependencies. + * + * Every call to this method will set a global variable, [[ensureFreeSpaceCalled]], that + * records the number of bytes this is called with. This variable is expected to be cleared + * by the test code later through [[assertEnsureFreeSpaceCalled]]. + */ + private def mockEnsureFreeSpace(mm: MemoryManager, numBytes: Long): Boolean = mm.synchronized { + require(ensureFreeSpaceCalled.get() === DEFAULT_ENSURE_FREE_SPACE_CALLED, + "bad test: ensure free space variable was not reset") + // Record the number of bytes we freed this call + ensureFreeSpaceCalled.set(numBytes) + if (numBytes <= mm.maxStorageMemory) { + def freeMemory = mm.maxStorageMemory - mm.storageMemoryUsed + val spaceToRelease = numBytes - freeMemory + if (spaceToRelease > 0) { + mm.releaseStorageMemory(spaceToRelease) + } + freeMemory >= numBytes + } else { + // We attempted to free more bytes than our max allowable memory + false + } + } + + /** + * Assert that [[MemoryStore.ensureFreeSpace]] is called with the given parameters. + */ + protected def assertEnsureFreeSpaceCalled(ms: MemoryStore, numBytes: Long): Unit = { + assert(ensureFreeSpaceCalled.get() === numBytes, + s"expected ensure free space to be called with $numBytes") + ensureFreeSpaceCalled.set(DEFAULT_ENSURE_FREE_SPACE_CALLED) + } + + /** + * Assert that [[MemoryStore.ensureFreeSpace]] is NOT called. 
+ */ + protected def assertEnsureFreeSpaceNotCalled[T](ms: MemoryStore): Unit = { + assert(ensureFreeSpaceCalled.get() === DEFAULT_ENSURE_FREE_SPACE_CALLED, + "ensure free space should not have been called!") + } +} + +private object MemoryManagerSuite { + private val DEFAULT_ENSURE_FREE_SPACE_CALLED = -1L +} diff --git a/core/src/test/scala/org/apache/spark/memory/StaticMemoryManagerSuite.scala b/core/src/test/scala/org/apache/spark/memory/StaticMemoryManagerSuite.scala index c436a8b5c9f81..6cae1f871e24b 100644 --- a/core/src/test/scala/org/apache/spark/memory/StaticMemoryManagerSuite.scala +++ b/core/src/test/scala/org/apache/spark/memory/StaticMemoryManagerSuite.scala @@ -19,32 +19,44 @@ package org.apache.spark.memory import scala.collection.mutable.ArrayBuffer -import org.mockito.Mockito.{mock, reset, verify, when} -import org.mockito.Matchers.{any, eq => meq} +import org.mockito.Mockito.when +import org.apache.spark.SparkConf import org.apache.spark.storage.{BlockId, BlockStatus, MemoryStore, TestBlockId} -import org.apache.spark.{SparkConf, SparkFunSuite} -class StaticMemoryManagerSuite extends SparkFunSuite { +class StaticMemoryManagerSuite extends MemoryManagerSuite { private val conf = new SparkConf().set("spark.storage.unrollFraction", "0.4") + private val evictedBlocks = new ArrayBuffer[(BlockId, BlockStatus)] + + /** + * Make a [[StaticMemoryManager]] and a [[MemoryStore]] with limited class dependencies. + */ + private def makeThings( + maxExecutionMem: Long, + maxStorageMem: Long): (StaticMemoryManager, MemoryStore) = { + val mm = new StaticMemoryManager( + conf, maxExecutionMemory = maxExecutionMem, maxStorageMemory = maxStorageMem) + val ms = makeMemoryStore(mm) + (mm, ms) + } test("basic execution memory") { val maxExecutionMem = 1000L val (mm, _) = makeThings(maxExecutionMem, Long.MaxValue) assert(mm.executionMemoryUsed === 0L) - assert(mm.acquireExecutionMemory(10L) === 10L) + assert(mm.acquireExecutionMemory(10L, evictedBlocks) === 10L) assert(mm.executionMemoryUsed === 10L) - assert(mm.acquireExecutionMemory(100L) === 100L) + assert(mm.acquireExecutionMemory(100L, evictedBlocks) === 100L) // Acquire up to the max - assert(mm.acquireExecutionMemory(1000L) === 890L) + assert(mm.acquireExecutionMemory(1000L, evictedBlocks) === 890L) assert(mm.executionMemoryUsed === maxExecutionMem) - assert(mm.acquireExecutionMemory(1L) === 0L) + assert(mm.acquireExecutionMemory(1L, evictedBlocks) === 0L) assert(mm.executionMemoryUsed === maxExecutionMem) mm.releaseExecutionMemory(800L) assert(mm.executionMemoryUsed === 200L) // Acquire after release - assert(mm.acquireExecutionMemory(1L) === 1L) + assert(mm.acquireExecutionMemory(1L, evictedBlocks) === 1L) assert(mm.executionMemoryUsed === 201L) // Release beyond what was acquired mm.releaseExecutionMemory(maxExecutionMem) @@ -54,37 +66,36 @@ class StaticMemoryManagerSuite extends SparkFunSuite { test("basic storage memory") { val maxStorageMem = 1000L val dummyBlock = TestBlockId("you can see the world you brought to live") - val evictedBlocks = new ArrayBuffer[(BlockId, BlockStatus)] val (mm, ms) = makeThings(Long.MaxValue, maxStorageMem) assert(mm.storageMemoryUsed === 0L) assert(mm.acquireStorageMemory(dummyBlock, 10L, evictedBlocks)) // `ensureFreeSpace` should be called with the number of bytes requested - assertEnsureFreeSpaceCalled(ms, dummyBlock, 10L) + assertEnsureFreeSpaceCalled(ms, 10L) assert(mm.storageMemoryUsed === 10L) - assert(evictedBlocks.isEmpty) assert(mm.acquireStorageMemory(dummyBlock, 100L, evictedBlocks)) - 
assertEnsureFreeSpaceCalled(ms, dummyBlock, 100L) + assertEnsureFreeSpaceCalled(ms, 100L) assert(mm.storageMemoryUsed === 110L) - // Acquire up to the max, not granted - assert(!mm.acquireStorageMemory(dummyBlock, 1000L, evictedBlocks)) - assertEnsureFreeSpaceCalled(ms, dummyBlock, 1000L) + // Acquire more than the max, not granted + assert(!mm.acquireStorageMemory(dummyBlock, maxStorageMem + 1L, evictedBlocks)) + assertEnsureFreeSpaceCalled(ms, maxStorageMem + 1L) assert(mm.storageMemoryUsed === 110L) - assert(mm.acquireStorageMemory(dummyBlock, 890L, evictedBlocks)) - assertEnsureFreeSpaceCalled(ms, dummyBlock, 890L) + // Acquire up to the max, requests after this are still granted due to LRU eviction + assert(mm.acquireStorageMemory(dummyBlock, maxStorageMem, evictedBlocks)) + assertEnsureFreeSpaceCalled(ms, 1000L) assert(mm.storageMemoryUsed === 1000L) - assert(!mm.acquireStorageMemory(dummyBlock, 1L, evictedBlocks)) - assertEnsureFreeSpaceCalled(ms, dummyBlock, 1L) + assert(mm.acquireStorageMemory(dummyBlock, 1L, evictedBlocks)) + assertEnsureFreeSpaceCalled(ms, 1L) assert(mm.storageMemoryUsed === 1000L) mm.releaseStorageMemory(800L) assert(mm.storageMemoryUsed === 200L) // Acquire after release assert(mm.acquireStorageMemory(dummyBlock, 1L, evictedBlocks)) - assertEnsureFreeSpaceCalled(ms, dummyBlock, 1L) + assertEnsureFreeSpaceCalled(ms, 1L) assert(mm.storageMemoryUsed === 201L) - mm.releaseStorageMemory() + mm.releaseAllStorageMemory() assert(mm.storageMemoryUsed === 0L) assert(mm.acquireStorageMemory(dummyBlock, 1L, evictedBlocks)) - assertEnsureFreeSpaceCalled(ms, dummyBlock, 1L) + assertEnsureFreeSpaceCalled(ms, 1L) assert(mm.storageMemoryUsed === 1L) // Release beyond what was acquired mm.releaseStorageMemory(100L) @@ -95,18 +106,17 @@ class StaticMemoryManagerSuite extends SparkFunSuite { val maxExecutionMem = 200L val maxStorageMem = 1000L val dummyBlock = TestBlockId("ain't nobody love like you do") - val dummyBlocks = new ArrayBuffer[(BlockId, BlockStatus)] val (mm, ms) = makeThings(maxExecutionMem, maxStorageMem) // Only execution memory should increase - assert(mm.acquireExecutionMemory(100L) === 100L) + assert(mm.acquireExecutionMemory(100L, evictedBlocks) === 100L) assert(mm.storageMemoryUsed === 0L) assert(mm.executionMemoryUsed === 100L) - assert(mm.acquireExecutionMemory(1000L) === 100L) + assert(mm.acquireExecutionMemory(1000L, evictedBlocks) === 100L) assert(mm.storageMemoryUsed === 0L) assert(mm.executionMemoryUsed === 200L) // Only storage memory should increase - assert(mm.acquireStorageMemory(dummyBlock, 50L, dummyBlocks)) - assertEnsureFreeSpaceCalled(ms, dummyBlock, 50L) + assert(mm.acquireStorageMemory(dummyBlock, 50L, evictedBlocks)) + assertEnsureFreeSpaceCalled(ms, 50L) assert(mm.storageMemoryUsed === 50L) assert(mm.executionMemoryUsed === 200L) // Only execution memory should be released @@ -114,7 +124,7 @@ class StaticMemoryManagerSuite extends SparkFunSuite { assert(mm.storageMemoryUsed === 50L) assert(mm.executionMemoryUsed === 67L) // Only storage memory should be released - mm.releaseStorageMemory() + mm.releaseAllStorageMemory() assert(mm.storageMemoryUsed === 0L) assert(mm.executionMemoryUsed === 67L) } @@ -122,51 +132,26 @@ class StaticMemoryManagerSuite extends SparkFunSuite { test("unroll memory") { val maxStorageMem = 1000L val dummyBlock = TestBlockId("lonely water") - val dummyBlocks = new ArrayBuffer[(BlockId, BlockStatus)] val (mm, ms) = makeThings(Long.MaxValue, maxStorageMem) - assert(mm.acquireUnrollMemory(dummyBlock, 100L, 
dummyBlocks)) - assertEnsureFreeSpaceCalled(ms, dummyBlock, 100L) + assert(mm.acquireUnrollMemory(dummyBlock, 100L, evictedBlocks)) + assertEnsureFreeSpaceCalled(ms, 100L) assert(mm.storageMemoryUsed === 100L) mm.releaseUnrollMemory(40L) assert(mm.storageMemoryUsed === 60L) when(ms.currentUnrollMemory).thenReturn(60L) - assert(mm.acquireUnrollMemory(dummyBlock, 500L, dummyBlocks)) + assert(mm.acquireUnrollMemory(dummyBlock, 500L, evictedBlocks)) // `spark.storage.unrollFraction` is 0.4, so the max unroll space is 400 bytes. // Since we already occupy 60 bytes, we will try to ensure only 400 - 60 = 340 bytes. - assertEnsureFreeSpaceCalled(ms, dummyBlock, 340L) + assertEnsureFreeSpaceCalled(ms, 340L) assert(mm.storageMemoryUsed === 560L) when(ms.currentUnrollMemory).thenReturn(560L) - assert(!mm.acquireUnrollMemory(dummyBlock, 800L, dummyBlocks)) + assert(!mm.acquireUnrollMemory(dummyBlock, 800L, evictedBlocks)) assert(mm.storageMemoryUsed === 560L) // We already have 560 bytes > the max unroll space of 400 bytes, so no bytes are freed - assertEnsureFreeSpaceCalled(ms, dummyBlock, 0L) + assertEnsureFreeSpaceCalled(ms, 0L) // Release beyond what was acquired mm.releaseUnrollMemory(maxStorageMem) assert(mm.storageMemoryUsed === 0L) } - /** - * Make a [[StaticMemoryManager]] and a [[MemoryStore]] with limited class dependencies. - */ - private def makeThings( - maxExecutionMem: Long, - maxStorageMem: Long): (StaticMemoryManager, MemoryStore) = { - val mm = new StaticMemoryManager( - conf, maxExecutionMemory = maxExecutionMem, maxStorageMemory = maxStorageMem) - val ms = mock(classOf[MemoryStore]) - mm.setMemoryStore(ms) - (mm, ms) - } - - /** - * Assert that [[MemoryStore.ensureFreeSpace]] is called with the given parameters. - */ - private def assertEnsureFreeSpaceCalled( - ms: MemoryStore, - blockId: BlockId, - numBytes: Long): Unit = { - verify(ms).ensureFreeSpace(meq(blockId), meq(numBytes: java.lang.Long), any()) - reset(ms) - } - } diff --git a/core/src/test/scala/org/apache/spark/memory/UnifiedMemoryManagerSuite.scala b/core/src/test/scala/org/apache/spark/memory/UnifiedMemoryManagerSuite.scala new file mode 100644 index 0000000000000..e7baa50dc2cd0 --- /dev/null +++ b/core/src/test/scala/org/apache/spark/memory/UnifiedMemoryManagerSuite.scala @@ -0,0 +1,208 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.memory + +import scala.collection.mutable.ArrayBuffer + +import org.scalatest.PrivateMethodTester + +import org.apache.spark.SparkConf +import org.apache.spark.storage.{BlockId, BlockStatus, MemoryStore, TestBlockId} + + +class UnifiedMemoryManagerSuite extends MemoryManagerSuite with PrivateMethodTester { + private val conf = new SparkConf().set("spark.memory.storageFraction", "0.5") + private val dummyBlock = TestBlockId("--") + private val evictedBlocks = new ArrayBuffer[(BlockId, BlockStatus)] + + /** + * Make a [[UnifiedMemoryManager]] and a [[MemoryStore]] with limited class dependencies. + */ + private def makeThings(maxMemory: Long): (UnifiedMemoryManager, MemoryStore) = { + val mm = new UnifiedMemoryManager(conf, maxMemory) + val ms = makeMemoryStore(mm) + (mm, ms) + } + + private def getStorageRegionSize(mm: UnifiedMemoryManager): Long = { + mm invokePrivate PrivateMethod[Long]('storageRegionSize)() + } + + test("storage region size") { + val maxMemory = 1000L + val (mm, _) = makeThings(maxMemory) + val storageFraction = conf.get("spark.memory.storageFraction").toDouble + val expectedStorageRegionSize = maxMemory * storageFraction + val actualStorageRegionSize = getStorageRegionSize(mm) + assert(expectedStorageRegionSize === actualStorageRegionSize) + } + + test("basic execution memory") { + val maxMemory = 1000L + val (mm, _) = makeThings(maxMemory) + assert(mm.executionMemoryUsed === 0L) + assert(mm.acquireExecutionMemory(10L, evictedBlocks) === 10L) + assert(mm.executionMemoryUsed === 10L) + assert(mm.acquireExecutionMemory(100L, evictedBlocks) === 100L) + // Acquire up to the max + assert(mm.acquireExecutionMemory(1000L, evictedBlocks) === 890L) + assert(mm.executionMemoryUsed === maxMemory) + assert(mm.acquireExecutionMemory(1L, evictedBlocks) === 0L) + assert(mm.executionMemoryUsed === maxMemory) + mm.releaseExecutionMemory(800L) + assert(mm.executionMemoryUsed === 200L) + // Acquire after release + assert(mm.acquireExecutionMemory(1L, evictedBlocks) === 1L) + assert(mm.executionMemoryUsed === 201L) + // Release beyond what was acquired + mm.releaseExecutionMemory(maxMemory) + assert(mm.executionMemoryUsed === 0L) + } + + test("basic storage memory") { + val maxMemory = 1000L + val (mm, ms) = makeThings(maxMemory) + assert(mm.storageMemoryUsed === 0L) + assert(mm.acquireStorageMemory(dummyBlock, 10L, evictedBlocks)) + // `ensureFreeSpace` should be called with the number of bytes requested + assertEnsureFreeSpaceCalled(ms, 10L) + assert(mm.storageMemoryUsed === 10L) + assert(mm.acquireStorageMemory(dummyBlock, 100L, evictedBlocks)) + assertEnsureFreeSpaceCalled(ms, 100L) + assert(mm.storageMemoryUsed === 110L) + // Acquire more than the max, not granted + assert(!mm.acquireStorageMemory(dummyBlock, maxMemory + 1L, evictedBlocks)) + assertEnsureFreeSpaceCalled(ms, maxMemory + 1L) + assert(mm.storageMemoryUsed === 110L) + // Acquire up to the max, requests after this are still granted due to LRU eviction + assert(mm.acquireStorageMemory(dummyBlock, maxMemory, evictedBlocks)) + assertEnsureFreeSpaceCalled(ms, 1000L) + assert(mm.storageMemoryUsed === 1000L) + assert(mm.acquireStorageMemory(dummyBlock, 1L, evictedBlocks)) + assertEnsureFreeSpaceCalled(ms, 1L) + assert(mm.storageMemoryUsed === 1000L) + mm.releaseStorageMemory(800L) + assert(mm.storageMemoryUsed === 200L) + // Acquire after release + assert(mm.acquireStorageMemory(dummyBlock, 1L, evictedBlocks)) + assertEnsureFreeSpaceCalled(ms, 1L) + assert(mm.storageMemoryUsed === 201L) + 
mm.releaseAllStorageMemory() + assert(mm.storageMemoryUsed === 0L) + assert(mm.acquireStorageMemory(dummyBlock, 1L, evictedBlocks)) + assertEnsureFreeSpaceCalled(ms, 1L) + assert(mm.storageMemoryUsed === 1L) + // Release beyond what was acquired + mm.releaseStorageMemory(100L) + assert(mm.storageMemoryUsed === 0L) + } + + test("execution evicts storage") { + val maxMemory = 1000L + val (mm, ms) = makeThings(maxMemory) + // First, ensure the test classes are set up as expected + val expectedStorageRegionSize = 500L + val expectedExecutionRegionSize = 500L + val storageRegionSize = getStorageRegionSize(mm) + val executionRegionSize = maxMemory - expectedStorageRegionSize + require(storageRegionSize === expectedStorageRegionSize, + "bad test: storage region size is unexpected") + require(executionRegionSize === expectedExecutionRegionSize, + "bad test: storage region size is unexpected") + // Acquire enough storage memory to exceed the storage region + assert(mm.acquireStorageMemory(dummyBlock, 750L, evictedBlocks)) + assertEnsureFreeSpaceCalled(ms, 750L) + assert(mm.executionMemoryUsed === 0L) + assert(mm.storageMemoryUsed === 750L) + require(mm.storageMemoryUsed > storageRegionSize, + s"bad test: storage memory used should exceed the storage region") + // Execution needs to request 250 bytes to evict storage memory + assert(mm.acquireExecutionMemory(100L, evictedBlocks) === 100L) + assert(mm.executionMemoryUsed === 100L) + assert(mm.storageMemoryUsed === 750L) + assertEnsureFreeSpaceNotCalled(ms) + // Execution wants 200 bytes but only 150 are free, so storage is evicted + assert(mm.acquireExecutionMemory(200L, evictedBlocks) === 200L) + assertEnsureFreeSpaceCalled(ms, 200L) + assert(mm.executionMemoryUsed === 300L) + mm.releaseAllStorageMemory() + require(mm.executionMemoryUsed < executionRegionSize, + s"bad test: execution memory used should be within the execution region") + require(mm.storageMemoryUsed === 0, "bad test: all storage memory should have been released") + // Acquire some storage memory again, but this time keep it within the storage region + assert(mm.acquireStorageMemory(dummyBlock, 400L, evictedBlocks)) + assertEnsureFreeSpaceCalled(ms, 400L) + require(mm.storageMemoryUsed < storageRegionSize, + s"bad test: storage memory used should be within the storage region") + // Execution cannot evict storage because the latter is within the storage fraction, + // so grant only what's remaining without evicting anything, i.e. 
1000 - 300 - 400 = 300 + assert(mm.acquireExecutionMemory(400L, evictedBlocks) === 300L) + assert(mm.executionMemoryUsed === 600L) + assert(mm.storageMemoryUsed === 400L) + assertEnsureFreeSpaceNotCalled(ms) + } + + test("storage does not evict execution") { + val maxMemory = 1000L + val (mm, ms) = makeThings(maxMemory) + // First, ensure the test classes are set up as expected + val expectedStorageRegionSize = 500L + val expectedExecutionRegionSize = 500L + val storageRegionSize = getStorageRegionSize(mm) + val executionRegionSize = maxMemory - expectedStorageRegionSize + require(storageRegionSize === expectedStorageRegionSize, + "bad test: storage region size is unexpected") + require(executionRegionSize === expectedExecutionRegionSize, + "bad test: storage region size is unexpected") + // Acquire enough execution memory to exceed the execution region + assert(mm.acquireExecutionMemory(800L, evictedBlocks) === 800L) + assert(mm.executionMemoryUsed === 800L) + assert(mm.storageMemoryUsed === 0L) + assertEnsureFreeSpaceNotCalled(ms) + require(mm.executionMemoryUsed > executionRegionSize, + s"bad test: execution memory used should exceed the execution region") + // Storage should not be able to evict execution + assert(mm.acquireStorageMemory(dummyBlock, 100L, evictedBlocks)) + assert(mm.executionMemoryUsed === 800L) + assert(mm.storageMemoryUsed === 100L) + assertEnsureFreeSpaceCalled(ms, 100L) + assert(!mm.acquireStorageMemory(dummyBlock, 250L, evictedBlocks)) + assert(mm.executionMemoryUsed === 800L) + assert(mm.storageMemoryUsed === 100L) + assertEnsureFreeSpaceCalled(ms, 250L) + mm.releaseExecutionMemory(maxMemory) + mm.releaseStorageMemory(maxMemory) + // Acquire some execution memory again, but this time keep it within the execution region + assert(mm.acquireExecutionMemory(200L, evictedBlocks) === 200L) + assert(mm.executionMemoryUsed === 200L) + assert(mm.storageMemoryUsed === 0L) + assertEnsureFreeSpaceNotCalled(ms) + require(mm.executionMemoryUsed < executionRegionSize, + s"bad test: execution memory used should be within the execution region") + // Storage should still not be able to evict execution + assert(mm.acquireStorageMemory(dummyBlock, 750L, evictedBlocks)) + assert(mm.executionMemoryUsed === 200L) + assert(mm.storageMemoryUsed === 750L) + assertEnsureFreeSpaceCalled(ms, 750L) + assert(!mm.acquireStorageMemory(dummyBlock, 850L, evictedBlocks)) + assert(mm.executionMemoryUsed === 200L) + assert(mm.storageMemoryUsed === 750L) + assertEnsureFreeSpaceCalled(ms, 850L) + } + +} diff --git a/core/src/test/scala/org/apache/spark/shuffle/ShuffleMemoryManagerSuite.scala b/core/src/test/scala/org/apache/spark/shuffle/ShuffleMemoryManagerSuite.scala index 6d45b1a101be6..5877aa042d4af 100644 --- a/core/src/test/scala/org/apache/spark/shuffle/ShuffleMemoryManagerSuite.scala +++ b/core/src/test/scala/org/apache/spark/shuffle/ShuffleMemoryManagerSuite.scala @@ -24,7 +24,8 @@ import org.mockito.Mockito._ import org.scalatest.concurrent.Timeouts import org.scalatest.time.SpanSugar._ -import org.apache.spark.{SparkConf, SparkFunSuite, TaskContext} +import org.apache.spark.{SparkFunSuite, TaskContext} +import org.apache.spark.executor.TaskMetrics class ShuffleMemoryManagerSuite extends SparkFunSuite with Timeouts { @@ -37,7 +38,9 @@ class ShuffleMemoryManagerSuite extends SparkFunSuite with Timeouts { try { val taskAttemptId = nextTaskAttemptId.getAndIncrement val mockTaskContext = mock(classOf[TaskContext], RETURNS_SMART_NULLS) + val taskMetrics = new TaskMetrics 
when(mockTaskContext.taskAttemptId()).thenReturn(taskAttemptId) + when(mockTaskContext.taskMetrics()).thenReturn(taskMetrics) TaskContext.setTaskContext(mockTaskContext) body } finally { diff --git a/core/src/test/scala/org/apache/spark/shuffle/unsafe/UnsafeShuffleSuite.scala b/core/src/test/scala/org/apache/spark/shuffle/unsafe/UnsafeShuffleSuite.scala index 6351539e91e97..259020a2ddc34 100644 --- a/core/src/test/scala/org/apache/spark/shuffle/unsafe/UnsafeShuffleSuite.scala +++ b/core/src/test/scala/org/apache/spark/shuffle/unsafe/UnsafeShuffleSuite.scala @@ -36,9 +36,6 @@ class UnsafeShuffleSuite extends ShuffleSuite with BeforeAndAfterAll { override def beforeAll() { conf.set("spark.shuffle.manager", "tungsten-sort") - // UnsafeShuffleManager requires at least 128 MB of memory per task in order to be able to sort - // shuffle records. - conf.set("spark.shuffle.memoryFraction", "0.5") } test("UnsafeShuffleManager properly cleans up files for shuffles that use the new shuffle path") { diff --git a/core/src/test/scala/org/apache/spark/util/collection/ExternalAppendOnlyMapSuite.scala b/core/src/test/scala/org/apache/spark/util/collection/ExternalAppendOnlyMapSuite.scala index 12e9bafcc92c1..0a03c32c647ae 100644 --- a/core/src/test/scala/org/apache/spark/util/collection/ExternalAppendOnlyMapSuite.scala +++ b/core/src/test/scala/org/apache/spark/util/collection/ExternalAppendOnlyMapSuite.scala @@ -22,6 +22,8 @@ import scala.collection.mutable.ArrayBuffer import org.apache.spark._ import org.apache.spark.io.CompressionCodec +// TODO: some of these spilling tests probably aren't actually spilling (SPARK-11078) + class ExternalAppendOnlyMapSuite extends SparkFunSuite with LocalSparkContext { private val allCompressionCodecs = CompressionCodec.ALL_COMPRESSION_CODECS private def createCombiner[T](i: T) = ArrayBuffer[T](i) @@ -243,7 +245,6 @@ class ExternalAppendOnlyMapSuite extends SparkFunSuite with LocalSparkContext { */ private def testSimpleSpilling(codec: Option[String] = None): Unit = { val conf = createSparkConf(loadDefaults = true, codec) // Load defaults for Spark home - conf.set("spark.shuffle.memoryFraction", "0.001") sc = new SparkContext("local-cluster[1,1,1024]", "test", conf) // reduceByKey - should spill ~8 times @@ -291,7 +292,6 @@ class ExternalAppendOnlyMapSuite extends SparkFunSuite with LocalSparkContext { test("spilling with hash collisions") { val conf = createSparkConf(loadDefaults = true) - conf.set("spark.shuffle.memoryFraction", "0.001") sc = new SparkContext("local-cluster[1,1,1024]", "test", conf) val map = createExternalMap[String] @@ -340,7 +340,6 @@ class ExternalAppendOnlyMapSuite extends SparkFunSuite with LocalSparkContext { test("spilling with many hash collisions") { val conf = createSparkConf(loadDefaults = true) - conf.set("spark.shuffle.memoryFraction", "0.0001") sc = new SparkContext("local-cluster[1,1,1024]", "test", conf) val map = new ExternalAppendOnlyMap[FixedHashObject, Int, Int](_ => 1, _ + _, _ + _) @@ -365,7 +364,6 @@ class ExternalAppendOnlyMapSuite extends SparkFunSuite with LocalSparkContext { test("spilling with hash collisions using the Int.MaxValue key") { val conf = createSparkConf(loadDefaults = true) - conf.set("spark.shuffle.memoryFraction", "0.001") sc = new SparkContext("local-cluster[1,1,1024]", "test", conf) val map = createExternalMap[Int] @@ -382,7 +380,6 @@ class ExternalAppendOnlyMapSuite extends SparkFunSuite with LocalSparkContext { test("spilling with null keys and values") { val conf = createSparkConf(loadDefaults = true) - 
conf.set("spark.shuffle.memoryFraction", "0.001") sc = new SparkContext("local-cluster[1,1,1024]", "test", conf) val map = createExternalMap[Int] @@ -401,8 +398,8 @@ class ExternalAppendOnlyMapSuite extends SparkFunSuite with LocalSparkContext { test("external aggregation updates peak execution memory") { val conf = createSparkConf(loadDefaults = false) - .set("spark.shuffle.memoryFraction", "0.001") .set("spark.shuffle.manager", "hash") // make sure we're not also using ExternalSorter + .set("spark.testing.memory", (10 * 1024 * 1024).toString) sc = new SparkContext("local", "test", conf) // No spilling AccumulatorSuite.verifyPeakExecutionMemorySet(sc, "external map without spilling") { diff --git a/core/src/test/scala/org/apache/spark/util/collection/ExternalSorterSuite.scala b/core/src/test/scala/org/apache/spark/util/collection/ExternalSorterSuite.scala index bdb0f4d507a7e..651c7eaa65ff5 100644 --- a/core/src/test/scala/org/apache/spark/util/collection/ExternalSorterSuite.scala +++ b/core/src/test/scala/org/apache/spark/util/collection/ExternalSorterSuite.scala @@ -24,6 +24,8 @@ import scala.util.Random import org.apache.spark._ import org.apache.spark.serializer.{JavaSerializer, KryoSerializer} +// TODO: some of these spilling tests probably aren't actually spilling (SPARK-11078) + class ExternalSorterSuite extends SparkFunSuite with LocalSparkContext { private def createSparkConf(loadDefaults: Boolean, kryo: Boolean): SparkConf = { val conf = new SparkConf(loadDefaults) @@ -38,6 +40,7 @@ class ExternalSorterSuite extends SparkFunSuite with LocalSparkContext { conf.set("spark.shuffle.sort.bypassMergeThreshold", "0") // Ensure that we actually have multiple batches per spill file conf.set("spark.shuffle.spill.batchSize", "10") + conf.set("spark.testing.memory", "2000000") conf } @@ -50,7 +53,6 @@ class ExternalSorterSuite extends SparkFunSuite with LocalSparkContext { } def emptyDataStream(conf: SparkConf) { - conf.set("spark.shuffle.memoryFraction", "0.001") conf.set("spark.shuffle.manager", "org.apache.spark.shuffle.sort.SortShuffleManager") sc = new SparkContext("local", "test", conf) @@ -91,7 +93,6 @@ class ExternalSorterSuite extends SparkFunSuite with LocalSparkContext { } def fewElementsPerPartition(conf: SparkConf) { - conf.set("spark.shuffle.memoryFraction", "0.001") conf.set("spark.shuffle.manager", "org.apache.spark.shuffle.sort.SortShuffleManager") sc = new SparkContext("local", "test", conf) @@ -140,7 +141,6 @@ class ExternalSorterSuite extends SparkFunSuite with LocalSparkContext { } def emptyPartitionsWithSpilling(conf: SparkConf) { - conf.set("spark.shuffle.memoryFraction", "0.001") conf.set("spark.shuffle.spill.initialMemoryThreshold", "512") conf.set("spark.shuffle.manager", "org.apache.spark.shuffle.sort.SortShuffleManager") sc = new SparkContext("local", "test", conf) @@ -174,7 +174,6 @@ class ExternalSorterSuite extends SparkFunSuite with LocalSparkContext { } def testSpillingInLocalCluster(conf: SparkConf) { - conf.set("spark.shuffle.memoryFraction", "0.001") conf.set("spark.shuffle.manager", "org.apache.spark.shuffle.sort.SortShuffleManager") sc = new SparkContext("local-cluster[1,1,1024]", "test", conf) @@ -252,7 +251,6 @@ class ExternalSorterSuite extends SparkFunSuite with LocalSparkContext { } def spillingInLocalClusterWithManyReduceTasks(conf: SparkConf) { - conf.set("spark.shuffle.memoryFraction", "0.001") conf.set("spark.shuffle.manager", "org.apache.spark.shuffle.sort.SortShuffleManager") sc = new SparkContext("local-cluster[2,1,1024]", "test", conf) @@ 
-323,7 +321,6 @@ class ExternalSorterSuite extends SparkFunSuite with LocalSparkContext { test("cleanup of intermediate files in sorter") { val conf = createSparkConf(true, false) // Load defaults, otherwise SPARK_HOME is not found - conf.set("spark.shuffle.memoryFraction", "0.001") conf.set("spark.shuffle.manager", "org.apache.spark.shuffle.sort.SortShuffleManager") sc = new SparkContext("local", "test", conf) val diskBlockManager = SparkEnv.get.blockManager.diskBlockManager @@ -348,7 +345,6 @@ class ExternalSorterSuite extends SparkFunSuite with LocalSparkContext { test("cleanup of intermediate files in sorter if there are errors") { val conf = createSparkConf(true, false) // Load defaults, otherwise SPARK_HOME is not found - conf.set("spark.shuffle.memoryFraction", "0.001") conf.set("spark.shuffle.manager", "org.apache.spark.shuffle.sort.SortShuffleManager") sc = new SparkContext("local", "test", conf) val diskBlockManager = SparkEnv.get.blockManager.diskBlockManager @@ -372,7 +368,6 @@ class ExternalSorterSuite extends SparkFunSuite with LocalSparkContext { test("cleanup of intermediate files in shuffle") { val conf = createSparkConf(false, false) - conf.set("spark.shuffle.memoryFraction", "0.001") conf.set("spark.shuffle.manager", "org.apache.spark.shuffle.sort.SortShuffleManager") sc = new SparkContext("local", "test", conf) val diskBlockManager = SparkEnv.get.blockManager.diskBlockManager @@ -387,7 +382,6 @@ class ExternalSorterSuite extends SparkFunSuite with LocalSparkContext { test("cleanup of intermediate files in shuffle with errors") { val conf = createSparkConf(false, false) - conf.set("spark.shuffle.memoryFraction", "0.001") conf.set("spark.shuffle.manager", "org.apache.spark.shuffle.sort.SortShuffleManager") sc = new SparkContext("local", "test", conf) val diskBlockManager = SparkEnv.get.blockManager.diskBlockManager @@ -416,7 +410,6 @@ class ExternalSorterSuite extends SparkFunSuite with LocalSparkContext { } def noPartialAggregationOrSorting(conf: SparkConf) { - conf.set("spark.shuffle.memoryFraction", "0.001") conf.set("spark.shuffle.manager", "org.apache.spark.shuffle.sort.SortShuffleManager") sc = new SparkContext("local", "test", conf) @@ -438,7 +431,6 @@ class ExternalSorterSuite extends SparkFunSuite with LocalSparkContext { } def partialAggregationWithoutSpill(conf: SparkConf) { - conf.set("spark.shuffle.memoryFraction", "0.001") conf.set("spark.shuffle.manager", "org.apache.spark.shuffle.sort.SortShuffleManager") sc = new SparkContext("local", "test", conf) @@ -461,7 +453,6 @@ class ExternalSorterSuite extends SparkFunSuite with LocalSparkContext { } def partialAggregationWIthSpillNoOrdering(conf: SparkConf) { - conf.set("spark.shuffle.memoryFraction", "0.001") conf.set("spark.shuffle.manager", "org.apache.spark.shuffle.sort.SortShuffleManager") sc = new SparkContext("local", "test", conf) @@ -485,7 +476,6 @@ class ExternalSorterSuite extends SparkFunSuite with LocalSparkContext { } def partialAggregationWithSpillWithOrdering(conf: SparkConf) { - conf.set("spark.shuffle.memoryFraction", "0.001") conf.set("spark.shuffle.manager", "org.apache.spark.shuffle.sort.SortShuffleManager") sc = new SparkContext("local", "test", conf) @@ -512,7 +502,6 @@ class ExternalSorterSuite extends SparkFunSuite with LocalSparkContext { } def sortingWithoutAggregationNoSpill(conf: SparkConf) { - conf.set("spark.shuffle.memoryFraction", "0.001") conf.set("spark.shuffle.manager", "org.apache.spark.shuffle.sort.SortShuffleManager") sc = new SparkContext("local", "test", conf) @@ -536,7 
+525,6 @@ class ExternalSorterSuite extends SparkFunSuite with LocalSparkContext { } def sortingWithoutAggregationWithSpill(conf: SparkConf) { - conf.set("spark.shuffle.memoryFraction", "0.001") conf.set("spark.shuffle.manager", "org.apache.spark.shuffle.sort.SortShuffleManager") sc = new SparkContext("local", "test", conf) @@ -553,7 +541,6 @@ class ExternalSorterSuite extends SparkFunSuite with LocalSparkContext { test("spilling with hash collisions") { val conf = createSparkConf(true, false) - conf.set("spark.shuffle.memoryFraction", "0.001") sc = new SparkContext("local-cluster[1,1,1024]", "test", conf) def createCombiner(i: String): ArrayBuffer[String] = ArrayBuffer[String](i) @@ -610,7 +597,6 @@ class ExternalSorterSuite extends SparkFunSuite with LocalSparkContext { test("spilling with many hash collisions") { val conf = createSparkConf(true, false) - conf.set("spark.shuffle.memoryFraction", "0.0001") sc = new SparkContext("local-cluster[1,1,1024]", "test", conf) val agg = new Aggregator[FixedHashObject, Int, Int](_ => 1, _ + _, _ + _) @@ -633,7 +619,6 @@ class ExternalSorterSuite extends SparkFunSuite with LocalSparkContext { test("spilling with hash collisions using the Int.MaxValue key") { val conf = createSparkConf(true, false) - conf.set("spark.shuffle.memoryFraction", "0.001") sc = new SparkContext("local-cluster[1,1,1024]", "test", conf) def createCombiner(i: Int): ArrayBuffer[Int] = ArrayBuffer[Int](i) @@ -657,7 +642,6 @@ class ExternalSorterSuite extends SparkFunSuite with LocalSparkContext { test("spilling with null keys and values") { val conf = createSparkConf(true, false) - conf.set("spark.shuffle.memoryFraction", "0.001") sc = new SparkContext("local-cluster[1,1,1024]", "test", conf) def createCombiner(i: String): ArrayBuffer[String] = ArrayBuffer[String](i) @@ -693,7 +677,6 @@ class ExternalSorterSuite extends SparkFunSuite with LocalSparkContext { } private def sortWithoutBreakingSortingContracts(conf: SparkConf) { - conf.set("spark.shuffle.memoryFraction", "0.01") conf.set("spark.shuffle.manager", "sort") sc = new SparkContext("local-cluster[1,1,1024]", "test", conf) diff --git a/docs/configuration.md b/docs/configuration.md index 154a3aee6855a..771d93be04b06 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -445,17 +445,6 @@ Apart from these, the following properties are also available, and may be useful met. - - spark.shuffle.memoryFraction - 0.2 - - Fraction of Java heap to use for aggregation and cogroups during shuffles. - At any given time, the collective size of - all in-memory maps used for shuffles is bounded by this limit, beyond which the contents will - begin to spill to disk. If spills are often, consider increasing this value at the expense of - spark.storage.memoryFraction. - - spark.shuffle.service.enabled false @@ -712,6 +701,76 @@ Apart from these, the following properties are also available, and may be useful +#### Memory Management + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+<tr><th>Property Name</th><th>Default</th><th>Meaning</th></tr>
+<tr>
+  <td><code>spark.memory.fraction</code></td>
+  <td>0.75</td>
+  <td>
+    Fraction of the heap space used for execution and storage. The lower this is, the more
+    frequently spills and cached data eviction occur. The purpose of this config is to set
+    aside memory for internal metadata, user data structures, and imprecise size estimation
+    in the case of sparse, unusually large records.
+  </td>
+</tr>
+<tr>
+  <td><code>spark.memory.storageFraction</code></td>
+  <td>0.5</td>
+  <td>
+    The size of the storage region within the space set aside by
+    <code>spark.memory.fraction</code>. This region is not statically reserved, but dynamically
+    allocated as cache requests come in. Cached data may be evicted only if total storage exceeds
+    this region.
+  </td>
+</tr>
+<tr>
+  <td><code>spark.memory.useLegacyMode</code></td>
+  <td>false</td>
+  <td>
+    Whether to enable the legacy memory management mode used in Spark 1.5 and before.
+    The legacy mode rigidly partitions the heap space into fixed-size regions,
+    potentially leading to excessive spilling if the application was not tuned.
+    The following deprecated memory fraction configurations are not read unless this is enabled:
+    <code>spark.shuffle.memoryFraction</code><br>
+    <code>spark.storage.memoryFraction</code><br>
+    <code>spark.storage.unrollFraction</code>
+  </td>
+</tr>
+<tr>
+  <td><code>spark.shuffle.memoryFraction</code></td>
+  <td>0.2</td>
+  <td>
+    (deprecated) This is read only if <code>spark.memory.useLegacyMode</code> is enabled.
+    Fraction of Java heap to use for aggregation and cogroups during shuffles.
+    At any given time, the collective size of
+    all in-memory maps used for shuffles is bounded by this limit, beyond which the contents will
+    begin to spill to disk. If spills are often, consider increasing this value at the expense of
+    <code>spark.storage.memoryFraction</code>.
+  </td>
+</tr>
+<tr>
+  <td><code>spark.storage.memoryFraction</code></td>
+  <td>0.6</td>
+  <td>
+    (deprecated) This is read only if <code>spark.memory.useLegacyMode</code> is enabled.
+    Fraction of Java heap to use for Spark's memory cache. This should not be larger than the "old"
+    generation of objects in the JVM, which by default is given 0.6 of the heap, but you can
+    increase it if you configure your own old generation size.
+  </td>
+</tr>
+<tr>
+  <td><code>spark.storage.unrollFraction</code></td>
+  <td>0.2</td>
+  <td>
+    (deprecated) This is read only if <code>spark.memory.useLegacyMode</code> is enabled.
+    Fraction of <code>spark.storage.memoryFraction</code> to use for unrolling blocks in memory.
+    This is dynamically allocated by dropping existing blocks when there is not enough free
+    storage space to unroll the new block in its entirety.
+  </td>
+</tr>
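To make the split between the new unified settings and the legacy fractions concrete, here is a minimal Scala sketch of how an application might set these properties on a SparkConf. The property names and default values are the ones documented in the table above; the object name, method names, and app names are illustrative assumptions, not part of this patch.

import org.apache.spark.SparkConf

object MemoryConfExample {
  // Unified memory management (the default after this change): spark.memory.fraction
  // splits the heap between Spark and user/metadata space, and spark.memory.storageFraction
  // bounds the storage region that execution may not evict.
  def unifiedConf(): SparkConf = new SparkConf()
    .setAppName("unified-memory-example")   // hypothetical app name
    .set("spark.memory.fraction", "0.75")
    .set("spark.memory.storageFraction", "0.5")

  // Legacy Spark 1.5-style behaviour: the deprecated fractions below are only read
  // when spark.memory.useLegacyMode is set to true.
  def legacyConf(): SparkConf = new SparkConf()
    .setAppName("legacy-memory-example")    // hypothetical app name
    .set("spark.memory.useLegacyMode", "true")
    .set("spark.shuffle.memoryFraction", "0.2")
    .set("spark.storage.memoryFraction", "0.6")
    .set("spark.storage.unrollFraction", "0.2")
}

Either conf could then be passed to a SparkContext; only the legacy one causes the deprecated fractions to be read.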
+ #### Execution Behavior @@ -824,15 +883,6 @@ Apart from these, the following properties are also available, and may be useful This setting is ignored for jobs generated through Spark Streaming's StreamingContext, since data may need to be rewritten to pre-existing output directories during checkpoint recovery. - - - - - @@ -842,15 +892,6 @@ Apart from these, the following properties are also available, and may be useful mapping has high overhead for blocks close to or below the page size of the operating system. - - - - - diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/TestShuffleMemoryManager.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/TestShuffleMemoryManager.scala index ff65d7bdf8b92..835f52fa566a2 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/TestShuffleMemoryManager.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/TestShuffleMemoryManager.scala @@ -57,7 +57,9 @@ class TestShuffleMemoryManager } private class GrantEverythingMemoryManager extends MemoryManager { - override def acquireExecutionMemory(numBytes: Long): Long = numBytes + override def acquireExecutionMemory( + numBytes: Long, + evictedBlocks: mutable.Buffer[(BlockId, BlockStatus)]): Long = numBytes override def acquireStorageMemory( blockId: BlockId, numBytes: Long, @@ -66,12 +68,6 @@ private class GrantEverythingMemoryManager extends MemoryManager { blockId: BlockId, numBytes: Long, evictedBlocks: mutable.Buffer[(BlockId, BlockStatus)]): Boolean = true - override def releaseExecutionMemory(numBytes: Long): Unit = { } - override def releaseStorageMemory(numBytes: Long): Unit = { } - override def releaseStorageMemory(): Unit = { } - override def releaseUnrollMemory(numBytes: Long): Unit = { } override def maxExecutionMemory: Long = Long.MaxValue override def maxStorageMemory: Long = Long.MaxValue - override def executionMemoryUsed: Long = Long.MaxValue - override def storageMemoryUsed: Long = Long.MaxValue } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/UnsafeRowSerializerSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/UnsafeRowSerializerSuite.scala index f7d48bc53ebbc..75d1fced594c4 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/UnsafeRowSerializerSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/UnsafeRowSerializerSuite.scala @@ -103,7 +103,7 @@ class UnsafeRowSerializerSuite extends SparkFunSuite with LocalSparkContext { val conf = new SparkConf() .set("spark.shuffle.spill.initialMemoryThreshold", "1024") .set("spark.shuffle.sort.bypassMergeThreshold", "0") - .set("spark.shuffle.memoryFraction", "0.0001") + .set("spark.testing.memory", "80000") sc = new SparkContext("local", "test", conf) outputFile = File.createTempFile("test-unsafe-row-serializer-spill", "") From 0d1b73b78b600420121ea8e58ff659ae8b4feebe Mon Sep 17 00:00:00 2001 From: trystanleftwich Date: Tue, 13 Oct 2015 22:11:08 +0100 Subject: [PATCH 040/139] =?UTF-8?q?[SPARK-11052]=20Spaces=20in=20the=20bui?= =?UTF-8?q?ld=20dir=20causes=20failures=20in=20the=20build/mv=E2=80=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit …n script Author: trystanleftwich Closes #9065 from trystanleftwich/SPARK-11052. 
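The commit message above describes build/mvn breaking when the build directory contains spaces; the actual quoting fix follows in the diff below. As a rough illustration of the same failure mode in Scala (the codebase's main language), the sketch below shows how a command assembled as a single whitespace-tokenized string breaks for such a path, while passing arguments individually does not. The path and object name are hypothetical.

import scala.sys.process._

object SpacesExample {
  // Hypothetical location containing a space, like an unquoted $MVN_HOME or $ZINC_BIN.
  val toolHome = "/opt/build tools/maven"

  // Fragile: the string form is re-tokenized on whitespace, so the path splits into
  // "/opt/build" and "tools/maven/bin/mvn" and the command fails.
  def broken(): Int = s"$toolHome/bin/mvn -version".!

  // Robust: each element of the Seq reaches the process as a single argument,
  // which is the equivalent of quoting the variable at the shell level.
  def quoted(): Int = Seq(s"$toolHome/bin/mvn", "-version").!
}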
--- build/mvn | 10 +++++----- make-distribution.sh | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/build/mvn b/build/mvn index ec0380afad319..7603ea03deb73 100755 --- a/build/mvn +++ b/build/mvn @@ -104,8 +104,8 @@ install_scala() { "scala-${scala_version}.tgz" \ "scala-${scala_version}/bin/scala" - SCALA_COMPILER="$(cd "$(dirname ${scala_bin})/../lib" && pwd)/scala-compiler.jar" - SCALA_LIBRARY="$(cd "$(dirname ${scala_bin})/../lib" && pwd)/scala-library.jar" + SCALA_COMPILER="$(cd "$(dirname "${scala_bin}")/../lib" && pwd)/scala-compiler.jar" + SCALA_LIBRARY="$(cd "$(dirname "${scala_bin}")/../lib" && pwd)/scala-library.jar" } # Setup healthy defaults for the Zinc port if none were provided from @@ -135,10 +135,10 @@ cd "${_CALLING_DIR}" # Now that zinc is ensured to be installed, check its status and, if its # not running or just installed, start it -if [ -n "${ZINC_INSTALL_FLAG}" -o -z "`${ZINC_BIN} -status -port ${ZINC_PORT}`" ]; then +if [ -n "${ZINC_INSTALL_FLAG}" -o -z "`"${ZINC_BIN}" -status -port ${ZINC_PORT}`" ]; then export ZINC_OPTS=${ZINC_OPTS:-"$_COMPILE_JVM_OPTS"} - ${ZINC_BIN} -shutdown -port ${ZINC_PORT} - ${ZINC_BIN} -start -port ${ZINC_PORT} \ + "${ZINC_BIN}" -shutdown -port ${ZINC_PORT} + "${ZINC_BIN}" -start -port ${ZINC_PORT} \ -scala-compiler "${SCALA_COMPILER}" \ -scala-library "${SCALA_LIBRARY}" &>/dev/null fi diff --git a/make-distribution.sh b/make-distribution.sh index 62c0ba6df7d3f..24418ace26270 100755 --- a/make-distribution.sh +++ b/make-distribution.sh @@ -121,7 +121,7 @@ if [ $(command -v git) ]; then fi -if [ ! $(command -v "$MVN") ] ; then +if [ ! "$(command -v "$MVN")" ] ; then echo -e "Could not locate Maven command: '$MVN'." echo -e "Specify the Maven command with the --mvn flag" exit -1; From ef72673b234579c161b8cbb6cafc851d9eba1bfb Mon Sep 17 00:00:00 2001 From: Josh Rosen Date: Tue, 13 Oct 2015 15:09:31 -0700 Subject: [PATCH 041/139] [SPARK-11080] [SQL] Incorporate per-JVM id into ExprId to prevent unsafe cross-JVM comparisions In the current implementation of named expressions' `ExprIds`, we rely on a per-JVM AtomicLong to ensure that expression ids are unique within a JVM. However, these expression ids will not be _globally_ unique. This opens the potential for id collisions if new expression ids happen to be created inside of tasks rather than on the driver. There are currently a few cases where tasks allocate expression ids, which happen to be safe because those expressions are never compared to expressions created on the driver. In order to guard against the introduction of invalid comparisons between driver-created and executor-created expression ids, this patch extends `ExprId` to incorporate a UUID to identify the JVM that created the id, which prevents collisions. Author: Josh Rosen Closes #9093 from JoshRosen/SPARK-11080. 
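Before the diff itself, a minimal sketch of the scheme described above: each JVM tags the ids it mints with its own random UUID, so two ids that happen to share a counter value but originate in different JVMs no longer compare equal. The names in the sketch are illustrative; the real change to NamedExpression and ExprId follows below.

import java.util.UUID
import java.util.concurrent.atomic.AtomicLong

object IdExample {
  // Each JVM gets its own random identity and its own monotonically increasing counter.
  private val jvmId: UUID = UUID.randomUUID()
  private val curId = new AtomicLong()

  // An id carries both the counter value and the JVM that minted it, so ids created on an
  // executor can never be equal to ids created on the driver, even if the counters collide.
  case class Id(value: Long, jvm: UUID)

  def newId(): Id = Id(curId.getAndIncrement(), jvmId)
}

Case-class equality compares both fields, which is what rules out accidental cross-JVM collisions.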
--- .../catalyst/expressions/namedExpressions.scala | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/namedExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/namedExpressions.scala index 5768c6087db32..8957df0be6814 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/namedExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/namedExpressions.scala @@ -17,6 +17,8 @@ package org.apache.spark.sql.catalyst.expressions +import java.util.UUID + import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute import org.apache.spark.sql.catalyst.expressions.codegen._ @@ -24,16 +26,23 @@ import org.apache.spark.sql.types._ object NamedExpression { private val curId = new java.util.concurrent.atomic.AtomicLong() - def newExprId: ExprId = ExprId(curId.getAndIncrement()) + private[expressions] val jvmId = UUID.randomUUID() + def newExprId: ExprId = ExprId(curId.getAndIncrement(), jvmId) def unapply(expr: NamedExpression): Option[(String, DataType)] = Some(expr.name, expr.dataType) } /** - * A globally unique (within this JVM) id for a given named expression. + * A globally unique id for a given named expression. * Used to identify which attribute output by a relation is being * referenced in a subsequent computation. + * + * The `id` field is unique within a given JVM, while the `uuid` is used to uniquely identify JVMs. */ -case class ExprId(id: Long) +case class ExprId(id: Long, jvmId: UUID) + +object ExprId { + def apply(id: Long): ExprId = ExprId(id, NamedExpression.jvmId) +} /** * An [[Expression]] that is named. From d0482f6af33e976db237405b2a978db1b7c2fd5b Mon Sep 17 00:00:00 2001 From: Josh Rosen Date: Tue, 13 Oct 2015 15:18:20 -0700 Subject: [PATCH 042/139] [SPARK-10932] [PROJECT INFRA] Port two minor changes to release-build.sh from scripts' old repo Spark's release packaging scripts used to live in a separate repository. Although these scripts are now part of the Spark repo, there are some minor patches made against the old repos that are missing in Spark's copy of the script. This PR ports those changes. /cc shivaram, who originally submitted these changes against https://github.com/rxin/spark-utils Author: Josh Rosen Closes #8986 from JoshRosen/port-release-build-fixes-from-rxin-repo. 
--- dev/create-release/release-build.sh | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/dev/create-release/release-build.sh b/dev/create-release/release-build.sh index 9dac43ce54425..cb79e9eba06e2 100755 --- a/dev/create-release/release-build.sh +++ b/dev/create-release/release-build.sh @@ -70,7 +70,7 @@ GIT_REF=${GIT_REF:-master} # Destination directory parent on remote server REMOTE_PARENT_DIR=${REMOTE_PARENT_DIR:-/home/$ASF_USERNAME/public_html} -SSH="ssh -o StrictHostKeyChecking=no -i $ASF_RSA_KEY" +SSH="ssh -o ConnectTimeout=300 -o StrictHostKeyChecking=no -i $ASF_RSA_KEY" GPG="gpg --no-tty --batch" NEXUS_ROOT=https://repository.apache.org/service/local/staging NEXUS_PROFILE=d63f592e7eac0 # Profile for Spark staging uploads @@ -141,8 +141,12 @@ if [[ "$1" == "package" ]]; then export ZINC_PORT=$ZINC_PORT echo "Creating distribution: $NAME ($FLAGS)" - ./make-distribution.sh --name $NAME --tgz $FLAGS -DzincPort=$ZINC_PORT 2>&1 > \ - ../binary-release-$NAME.log + + # Get maven home set by MVN + MVN_HOME=`$MVN -version 2>&1 | grep 'Maven home' | awk '{print $NF}'` + + ./make-distribution.sh --name $NAME --mvn $MVN_HOME/bin/mvn --tgz $FLAGS \ + -DzincPort=$ZINC_PORT 2>&1 > ../binary-release-$NAME.log cd .. cp spark-$SPARK_VERSION-bin-$NAME/spark-$SPARK_VERSION-bin-$NAME.tgz . From 3889b1c7a96da1111946fa63ad69489b83468646 Mon Sep 17 00:00:00 2001 From: vectorijk Date: Tue, 13 Oct 2015 15:57:36 -0700 Subject: [PATCH 043/139] [SPARK-11059] [ML] Change range of quantile probabilities in AFTSurvivalRegression Value of the quantile probabilities array should be in the range (0, 1) instead of [0,1] in `AFTSurvivalRegression.scala` according to [Discussion] (https://github.com/apache/spark/pull/8926#discussion-diff-40698242) Author: vectorijk Closes #9083 from vectorijk/spark-11059. --- .../apache/spark/ml/regression/AFTSurvivalRegression.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/AFTSurvivalRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/AFTSurvivalRegression.scala index 717caacad30eb..ac2c3d825f13c 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/AFTSurvivalRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/AFTSurvivalRegression.scala @@ -59,14 +59,14 @@ private[regression] trait AFTSurvivalRegressionParams extends Params /** * Param for quantile probabilities array. - * Values of the quantile probabilities array should be in the range [0, 1] + * Values of the quantile probabilities array should be in the range (0, 1) * and the array should be non-empty. * @group param */ @Since("1.6.0") final val quantileProbabilities: DoubleArrayParam = new DoubleArrayParam(this, "quantileProbabilities", "quantile probabilities array", - (t: Array[Double]) => t.forall(ParamValidators.inRange(0, 1)) && t.length > 0) + (t: Array[Double]) => t.forall(ParamValidators.inRange(0, 1, false, false)) && t.length > 0) /** @group getParam */ @Since("1.6.0") From 328d1b3e4bc39cce653342e04f9e08af12dd7ed8 Mon Sep 17 00:00:00 2001 From: Michael Armbrust Date: Tue, 13 Oct 2015 17:09:17 -0700 Subject: [PATCH 044/139] [SPARK-11090] [SQL] Constructor for Product types from InternalRow This is a first draft of the ability to construct expressions that will take a catalyst internal row and construct a Product (case class or tuple) that has fields with the correct names. 
Support include: - Nested classes - Maps - Efficiently handling of arrays of primitive types Not yet supported: - Case classes that require custom collection types (i.e. List instead of Seq). Author: Michael Armbrust Closes #9100 from marmbrus/productContructor. --- .../catalyst/expressions/UnsafeArrayData.java | 4 + .../spark/sql/catalyst/ScalaReflection.scala | 302 +++++++++++++- .../spark/sql/catalyst/encoders/Encoder.scala | 14 + .../catalyst/encoders/ProductEncoder.scala | 26 +- .../sql/catalyst/expressions/objects.scala | 154 +++++++- .../spark/sql/types/ArrayBasedMapData.scala | 4 + .../apache/spark/sql/types/ArrayData.scala | 5 + .../spark/sql/types/GenericArrayData.scala | 4 +- .../encoders/ProductEncoderSuite.scala | 369 +++++++++++------- 9 files changed, 723 insertions(+), 159 deletions(-) diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeArrayData.java b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeArrayData.java index 796f8abec9a1d..4c63abb071e3b 100644 --- a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeArrayData.java +++ b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeArrayData.java @@ -74,6 +74,10 @@ private void assertIndexIsValid(int ordinal) { assert ordinal < numElements : "ordinal (" + ordinal + ") should < " + numElements; } + public Object[] array() { + throw new UnsupportedOperationException("Only supported on GenericArrayData."); + } + /** * Construct a new UnsafeArrayData. The resulting UnsafeArrayData won't be usable until * `pointTo()` has been called, since the value returned by this constructor is equivalent diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ScalaReflection.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ScalaReflection.scala index 8b733f2a0b91f..8edd6498e5163 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ScalaReflection.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ScalaReflection.scala @@ -17,6 +17,7 @@ package org.apache.spark.sql.catalyst +import org.apache.spark.sql.catalyst.analysis.{UnresolvedExtractValue, UnresolvedAttribute} import org.apache.spark.sql.catalyst.util.DateTimeUtils import org.apache.spark.unsafe.types.UTF8String import org.apache.spark.util.Utils @@ -80,6 +81,9 @@ trait ScalaReflection { * Returns the Spark SQL DataType for a given scala type. Where this is not an exact mapping * to a native type, an ObjectType is returned. Special handling is also used for Arrays including * those that hold primitive types. + * + * Unlike `schemaFor`, this function doesn't do any massaging of types into the Spark SQL type + * system. As a result, ObjectType will be returned for things like boxed Integers */ def dataTypeFor(tpe: `Type`): DataType = tpe match { case t if t <:< definitions.IntTpe => IntegerType @@ -114,6 +118,298 @@ trait ScalaReflection { } } + /** + * Given a type `T` this function constructs and ObjectType that holds a class of type + * Array[T]. Special handling is performed for primitive types to map them back to their raw + * JVM form instead of the Scala Array that handles auto boxing. 
+ */ + def arrayClassFor(tpe: `Type`): DataType = { + val cls = tpe match { + case t if t <:< definitions.IntTpe => classOf[Array[Int]] + case t if t <:< definitions.LongTpe => classOf[Array[Long]] + case t if t <:< definitions.DoubleTpe => classOf[Array[Double]] + case t if t <:< definitions.FloatTpe => classOf[Array[Float]] + case t if t <:< definitions.ShortTpe => classOf[Array[Short]] + case t if t <:< definitions.ByteTpe => classOf[Array[Byte]] + case t if t <:< definitions.BooleanTpe => classOf[Array[Boolean]] + case other => + // There is probably a better way to do this, but I couldn't find it... + val elementType = dataTypeFor(other).asInstanceOf[ObjectType].cls + java.lang.reflect.Array.newInstance(elementType, 1).getClass + + } + ObjectType(cls) + } + + /** + * Returns an expression that can be used to construct an object of type `T` given a an input + * row with a compatible schema. Fields of the row will be extracted using UnresolvedAttributes + * of the same name as the constructor arguments. Nested classes will have their fields accessed + * using UnresolvedExtractValue. + */ + def constructorFor[T : TypeTag]: Expression = constructorFor(typeOf[T], None) + + protected def constructorFor( + tpe: `Type`, + path: Option[Expression]): Expression = ScalaReflectionLock.synchronized { + + /** Returns the current path with a sub-field extracted. */ + def addToPath(part: String) = + path + .map(p => UnresolvedExtractValue(p, expressions.Literal(part))) + .getOrElse(UnresolvedAttribute(part)) + + /** Returns the current path or throws an error. */ + def getPath = path.getOrElse(sys.error("Constructors must start at a class type")) + + tpe match { + case t if !dataTypeFor(t).isInstanceOf[ObjectType] => + getPath + + case t if t <:< localTypeOf[Option[_]] => + val TypeRef(_, _, Seq(optType)) = t + val boxedType = optType match { + // For primitive types we must manually box the primitive value. 
+ case t if t <:< definitions.IntTpe => Some(classOf[java.lang.Integer]) + case t if t <:< definitions.LongTpe => Some(classOf[java.lang.Long]) + case t if t <:< definitions.DoubleTpe => Some(classOf[java.lang.Double]) + case t if t <:< definitions.FloatTpe => Some(classOf[java.lang.Float]) + case t if t <:< definitions.ShortTpe => Some(classOf[java.lang.Short]) + case t if t <:< definitions.ByteTpe => Some(classOf[java.lang.Byte]) + case t if t <:< definitions.BooleanTpe => Some(classOf[java.lang.Boolean]) + case _ => None + } + + boxedType.map { boxedType => + val objectType = ObjectType(boxedType) + WrapOption( + objectType, + NewInstance( + boxedType, + getPath :: Nil, + propagateNull = true, + objectType)) + }.getOrElse { + val className: String = optType.erasure.typeSymbol.asClass.fullName + val cls = Utils.classForName(className) + val objectType = ObjectType(cls) + + WrapOption(objectType, constructorFor(optType, path)) + } + + case t if t <:< localTypeOf[java.lang.Integer] => + val boxedType = classOf[java.lang.Integer] + val objectType = ObjectType(boxedType) + NewInstance(boxedType, getPath :: Nil, propagateNull = true, objectType) + + case t if t <:< localTypeOf[java.lang.Long] => + val boxedType = classOf[java.lang.Long] + val objectType = ObjectType(boxedType) + NewInstance(boxedType, getPath :: Nil, propagateNull = true, objectType) + + case t if t <:< localTypeOf[java.lang.Double] => + val boxedType = classOf[java.lang.Double] + val objectType = ObjectType(boxedType) + NewInstance(boxedType, getPath :: Nil, propagateNull = true, objectType) + + case t if t <:< localTypeOf[java.lang.Float] => + val boxedType = classOf[java.lang.Float] + val objectType = ObjectType(boxedType) + NewInstance(boxedType, getPath :: Nil, propagateNull = true, objectType) + + case t if t <:< localTypeOf[java.lang.Short] => + val boxedType = classOf[java.lang.Short] + val objectType = ObjectType(boxedType) + NewInstance(boxedType, getPath :: Nil, propagateNull = true, objectType) + + case t if t <:< localTypeOf[java.lang.Byte] => + val boxedType = classOf[java.lang.Byte] + val objectType = ObjectType(boxedType) + NewInstance(boxedType, getPath :: Nil, propagateNull = true, objectType) + + case t if t <:< localTypeOf[java.lang.Boolean] => + val boxedType = classOf[java.lang.Boolean] + val objectType = ObjectType(boxedType) + NewInstance(boxedType, getPath :: Nil, propagateNull = true, objectType) + + case t if t <:< localTypeOf[java.sql.Date] => + StaticInvoke( + DateTimeUtils, + ObjectType(classOf[java.sql.Date]), + "toJavaDate", + getPath :: Nil, + propagateNull = true) + + case t if t <:< localTypeOf[java.sql.Timestamp] => + StaticInvoke( + DateTimeUtils, + ObjectType(classOf[java.sql.Timestamp]), + "toJavaTimestamp", + getPath :: Nil, + propagateNull = true) + + case t if t <:< localTypeOf[java.lang.String] => + Invoke(getPath, "toString", ObjectType(classOf[String])) + + case t if t <:< localTypeOf[java.math.BigDecimal] => + Invoke(getPath, "toJavaBigDecimal", ObjectType(classOf[java.math.BigDecimal])) + + case t if t <:< localTypeOf[Array[_]] => + val TypeRef(_, _, Seq(elementType)) = t + val elementDataType = dataTypeFor(elementType) + val Schema(dataType, nullable) = schemaFor(elementType) + + val primitiveMethod = elementType match { + case t if t <:< definitions.IntTpe => Some("toIntArray") + case t if t <:< definitions.LongTpe => Some("toLongArray") + case t if t <:< definitions.DoubleTpe => Some("toDoubleArray") + case t if t <:< definitions.FloatTpe => Some("toFloatArray") + case t if t 
<:< definitions.ShortTpe => Some("toShortArray") + case t if t <:< definitions.ByteTpe => Some("toByteArray") + case t if t <:< definitions.BooleanTpe => Some("toBooleanArray") + case _ => None + } + + primitiveMethod.map { method => + Invoke(getPath, method, dataTypeFor(t)) + }.getOrElse { + val returnType = dataTypeFor(t) + Invoke( + MapObjects(p => constructorFor(elementType, Some(p)), getPath, dataType), + "array", + returnType) + } + + case t if t <:< localTypeOf[Map[_, _]] => + val TypeRef(_, _, Seq(keyType, valueType)) = t + val Schema(keyDataType, _) = schemaFor(keyType) + val Schema(valueDataType, valueNullable) = schemaFor(valueType) + + val primitiveMethodKey = keyType match { + case t if t <:< definitions.IntTpe => Some("toIntArray") + case t if t <:< definitions.LongTpe => Some("toLongArray") + case t if t <:< definitions.DoubleTpe => Some("toDoubleArray") + case t if t <:< definitions.FloatTpe => Some("toFloatArray") + case t if t <:< definitions.ShortTpe => Some("toShortArray") + case t if t <:< definitions.ByteTpe => Some("toByteArray") + case t if t <:< definitions.BooleanTpe => Some("toBooleanArray") + case _ => None + } + + val keyData = + Invoke( + MapObjects( + p => constructorFor(keyType, Some(p)), + Invoke(getPath, "keyArray", ArrayType(keyDataType)), + keyDataType), + "array", + ObjectType(classOf[Array[Any]])) + + val primitiveMethodValue = valueType match { + case t if t <:< definitions.IntTpe => Some("toIntArray") + case t if t <:< definitions.LongTpe => Some("toLongArray") + case t if t <:< definitions.DoubleTpe => Some("toDoubleArray") + case t if t <:< definitions.FloatTpe => Some("toFloatArray") + case t if t <:< definitions.ShortTpe => Some("toShortArray") + case t if t <:< definitions.ByteTpe => Some("toByteArray") + case t if t <:< definitions.BooleanTpe => Some("toBooleanArray") + case _ => None + } + + val valueData = + Invoke( + MapObjects( + p => constructorFor(valueType, Some(p)), + Invoke(getPath, "valueArray", ArrayType(valueDataType)), + valueDataType), + "array", + ObjectType(classOf[Array[Any]])) + + StaticInvoke( + ArrayBasedMapData, + ObjectType(classOf[Map[_, _]]), + "toScalaMap", + keyData :: valueData :: Nil) + + case t if t <:< localTypeOf[Seq[_]] => + val TypeRef(_, _, Seq(elementType)) = t + val elementDataType = dataTypeFor(elementType) + val Schema(dataType, nullable) = schemaFor(elementType) + + // Avoid boxing when possible by just wrapping a primitive array. 
+ val primitiveMethod = elementType match { + case _ if nullable => None + case t if t <:< definitions.IntTpe => Some("toIntArray") + case t if t <:< definitions.LongTpe => Some("toLongArray") + case t if t <:< definitions.DoubleTpe => Some("toDoubleArray") + case t if t <:< definitions.FloatTpe => Some("toFloatArray") + case t if t <:< definitions.ShortTpe => Some("toShortArray") + case t if t <:< definitions.ByteTpe => Some("toByteArray") + case t if t <:< definitions.BooleanTpe => Some("toBooleanArray") + case _ => None + } + + val arrayData = primitiveMethod.map { method => + Invoke(getPath, method, arrayClassFor(elementType)) + }.getOrElse { + Invoke( + MapObjects(p => constructorFor(elementType, Some(p)), getPath, dataType), + "array", + arrayClassFor(elementType)) + } + + StaticInvoke( + scala.collection.mutable.WrappedArray, + ObjectType(classOf[Seq[_]]), + "make", + arrayData :: Nil) + + + case t if t <:< localTypeOf[Product] => + val formalTypeArgs = t.typeSymbol.asClass.typeParams + val TypeRef(_, _, actualTypeArgs) = t + val constructorSymbol = t.member(nme.CONSTRUCTOR) + val params = if (constructorSymbol.isMethod) { + constructorSymbol.asMethod.paramss + } else { + // Find the primary constructor, and use its parameter ordering. + val primaryConstructorSymbol: Option[Symbol] = + constructorSymbol.asTerm.alternatives.find(s => + s.isMethod && s.asMethod.isPrimaryConstructor) + + if (primaryConstructorSymbol.isEmpty) { + sys.error("Internal SQL error: Product object did not have a primary constructor.") + } else { + primaryConstructorSymbol.get.asMethod.paramss + } + } + + val className: String = t.erasure.typeSymbol.asClass.fullName + val cls = Utils.classForName(className) + + val arguments = params.head.map { p => + val fieldName = p.name.toString + val fieldType = p.typeSignature.substituteTypes(formalTypeArgs, actualTypeArgs) + val dataType = dataTypeFor(fieldType) + + constructorFor(fieldType, Some(addToPath(fieldName))) + } + + val newInstance = NewInstance(cls, arguments, propagateNull = false, ObjectType(cls)) + + if (path.nonEmpty) { + expressions.If( + IsNull(getPath), + expressions.Literal.create(null, ObjectType(cls)), + newInstance + ) + } else { + newInstance + } + + } + } + /** Returns expressions for extracting all the fields from the given type. 
*/ def extractorsFor[T : TypeTag](inputObject: Expression): Seq[Expression] = { ScalaReflectionLock.synchronized { @@ -227,13 +523,13 @@ trait ScalaReflection { val elementDataType = dataTypeFor(elementType) val Schema(dataType, nullable) = schemaFor(elementType) - if (!elementDataType.isInstanceOf[AtomicType]) { - MapObjects(extractorFor(_, elementType), inputObject, elementDataType) - } else { + if (dataType.isInstanceOf[AtomicType]) { NewInstance( classOf[GenericArrayData], inputObject :: Nil, dataType = ArrayType(dataType, nullable)) + } else { + MapObjects(extractorFor(_, elementType), inputObject, elementDataType) } case t if t <:< localTypeOf[Map[_, _]] => diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/encoders/Encoder.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/encoders/Encoder.scala index 8dacfa9477ee6..3618247d5d51a 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/encoders/Encoder.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/encoders/Encoder.scala @@ -17,8 +17,10 @@ package org.apache.spark.sql.catalyst.encoders + import scala.reflect.ClassTag +import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.types.StructType @@ -41,4 +43,16 @@ trait Encoder[T] { * copy the result before making another call if required. */ def toRow(t: T): InternalRow + + /** + * Returns an object of type `T`, extracting the required values from the provided row. Note that + * you must bind` and encoder to a specific schema before you can call this function. + */ + def fromRow(row: InternalRow): T + + /** + * Returns a new copy of this encoder, where the expressions used by `fromRow` are bound to the + * given schema + */ + def bind(schema: Seq[Attribute]): Encoder[T] } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/encoders/ProductEncoder.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/encoders/ProductEncoder.scala index a23613673ebb5..b0381880c3bdb 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/encoders/ProductEncoder.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/encoders/ProductEncoder.scala @@ -17,8 +17,10 @@ package org.apache.spark.sql.catalyst.encoders +import org.apache.spark.sql.catalyst.analysis.SimpleAnalyzer import org.apache.spark.sql.catalyst.expressions._ -import org.apache.spark.sql.catalyst.expressions.codegen.GenerateUnsafeProjection +import org.apache.spark.sql.catalyst.expressions.codegen.{GenerateSafeProjection, GenerateUnsafeProjection} +import org.apache.spark.sql.catalyst.plans.logical.{LocalRelation, Project} import scala.reflect.ClassTag import scala.reflect.runtime.universe.{typeTag, TypeTag} @@ -31,7 +33,7 @@ import org.apache.spark.sql.types.{ObjectType, StructType} * internal binary representation. */ object ProductEncoder { - def apply[T <: Product : TypeTag]: Encoder[T] = { + def apply[T <: Product : TypeTag]: ClassEncoder[T] = { // We convert the not-serializable TypeTag into StructType and ClassTag. 
val schema = ScalaReflection.schemaFor[T].dataType.asInstanceOf[StructType] val mirror = typeTag[T].mirror @@ -39,7 +41,8 @@ object ProductEncoder { val inputObject = BoundReference(0, ObjectType(cls), nullable = true) val extractExpressions = ScalaReflection.extractorsFor[T](inputObject) - new ClassEncoder[T](schema, extractExpressions, ClassTag[T](cls)) + val constructExpression = ScalaReflection.constructorFor[T] + new ClassEncoder[T](schema, extractExpressions, constructExpression, ClassTag[T](cls)) } } @@ -54,14 +57,31 @@ object ProductEncoder { case class ClassEncoder[T]( schema: StructType, extractExpressions: Seq[Expression], + constructExpression: Expression, clsTag: ClassTag[T]) extends Encoder[T] { private val extractProjection = GenerateUnsafeProjection.generate(extractExpressions) private val inputRow = new GenericMutableRow(1) + private lazy val constructProjection = GenerateSafeProjection.generate(constructExpression :: Nil) + private val dataType = ObjectType(clsTag.runtimeClass) + override def toRow(t: T): InternalRow = { inputRow(0) = t extractProjection(inputRow) } + + override def fromRow(row: InternalRow): T = { + constructProjection(row).get(0, dataType).asInstanceOf[T] + } + + override def bind(schema: Seq[Attribute]): ClassEncoder[T] = { + val plan = Project(Alias(constructExpression, "object")() :: Nil, LocalRelation(schema)) + val analyzedPlan = SimpleAnalyzer.execute(plan) + val resolvedExpression = analyzedPlan.expressions.head.children.head + val boundExpression = BindReferences.bindReference(resolvedExpression, schema) + + copy(constructExpression = boundExpression) + } } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/objects.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/objects.scala index e1f960a6e605c..e8c1c93cf5620 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/objects.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/objects.scala @@ -17,9 +17,12 @@ package org.apache.spark.sql.catalyst.expressions +import org.apache.spark.sql.catalyst.analysis.SimpleAnalyzer +import org.apache.spark.sql.catalyst.plans.logical.{Project, LocalRelation} + import scala.language.existentials -import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.{ScalaReflection, InternalRow} import org.apache.spark.sql.catalyst.expressions.codegen.{GeneratedExpressionCode, CodeGenContext} import org.apache.spark.sql.types._ @@ -48,7 +51,7 @@ case class StaticInvoke( case other => other.getClass.getName.stripSuffix("$") } override def nullable: Boolean = true - override def children: Seq[Expression] = Nil + override def children: Seq[Expression] = arguments override def eval(input: InternalRow): Any = throw new UnsupportedOperationException("Only code-generated evaluation is supported.") @@ -69,7 +72,7 @@ case class StaticInvoke( s""" ${argGen.map(_.code).mkString("\n")} - boolean ${ev.isNull} = true; + boolean ${ev.isNull} = !$argsNonNull; $javaType ${ev.value} = ${ctx.defaultValue(dataType)}; if ($argsNonNull) { @@ -81,8 +84,8 @@ case class StaticInvoke( s""" ${argGen.map(_.code).mkString("\n")} - final boolean ${ev.isNull} = ${ev.value} == null; $javaType ${ev.value} = $objectName.$functionName($argString); + final boolean ${ev.isNull} = ${ev.value} == null; """ } } @@ -92,6 +95,10 @@ case class StaticInvoke( * Calls the specified function on an object, optionally passing arguments. 
If the `targetObject` * expression evaluates to null then null will be returned. * + * In some cases, due to erasure, the schema may expect a primitive type when in fact the method + * is returning java.lang.Object. In this case, we will generate code that attempts to unbox the + * value automatically. + * * @param targetObject An expression that will return the object to call the method on. * @param functionName The name of the method to call. * @param dataType The expected return type of the function. @@ -109,6 +116,35 @@ case class Invoke( override def eval(input: InternalRow): Any = throw new UnsupportedOperationException("Only code-generated evaluation is supported.") + lazy val method = targetObject.dataType match { + case ObjectType(cls) => + cls + .getMethods + .find(_.getName == functionName) + .getOrElse(sys.error(s"Couldn't find $functionName on $cls")) + .getReturnType + .getName + case _ => "" + } + + lazy val unboxer = (dataType, method) match { + case (IntegerType, "java.lang.Object") => (s: String) => + s"((java.lang.Integer)$s).intValue()" + case (LongType, "java.lang.Object") => (s: String) => + s"((java.lang.Long)$s).longValue()" + case (FloatType, "java.lang.Object") => (s: String) => + s"((java.lang.Float)$s).floatValue()" + case (ShortType, "java.lang.Object") => (s: String) => + s"((java.lang.Short)$s).shortValue()" + case (ByteType, "java.lang.Object") => (s: String) => + s"((java.lang.Byte)$s).byteValue()" + case (DoubleType, "java.lang.Object") => (s: String) => + s"((java.lang.Double)$s).doubleValue()" + case (BooleanType, "java.lang.Object") => (s: String) => + s"((java.lang.Boolean)$s).booleanValue()" + case _ => identity[String] _ + } + override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = { val javaType = ctx.javaType(dataType) val obj = targetObject.gen(ctx) @@ -123,6 +159,8 @@ case class Invoke( "" } + val value = unboxer(s"${obj.value}.$functionName($argString)") + s""" ${obj.code} ${argGen.map(_.code).mkString("\n")} @@ -130,7 +168,7 @@ case class Invoke( boolean ${ev.isNull} = ${obj.value} == null; $javaType ${ev.value} = ${ev.isNull} ? - ${ctx.defaultValue(dataType)} : ($javaType) ${obj.value}.$functionName($argString); + ${ctx.defaultValue(dataType)} : ($javaType) $value; $objNullCheck """ } @@ -190,8 +228,8 @@ case class NewInstance( s""" ${argGen.map(_.code).mkString("\n")} - final boolean ${ev.isNull} = ${ev.value} == null; $javaType ${ev.value} = new $className($argString); + final boolean ${ev.isNull} = ${ev.value} == null; """ } } @@ -210,8 +248,6 @@ case class UnwrapOption( override def nullable: Boolean = true - override def children: Seq[Expression] = Nil - override def inputTypes: Seq[AbstractDataType] = ObjectType :: Nil override def eval(input: InternalRow): Any = @@ -231,6 +267,43 @@ case class UnwrapOption( } } +/** + * Converts the result of evaluating `child` into an option, checking both the isNull bit and + * (in the case of reference types) equality with null. + * @param optionType The datatype to be held inside of the Option. + * @param child The expression to evaluate and wrap. 
+ */ +case class WrapOption(optionType: DataType, child: Expression) + extends UnaryExpression with ExpectsInputTypes { + + override def dataType: DataType = ObjectType(classOf[Option[_]]) + + override def nullable: Boolean = true + + override def inputTypes: Seq[AbstractDataType] = ObjectType :: Nil + + override def eval(input: InternalRow): Any = + throw new UnsupportedOperationException("Only code-generated evaluation is supported") + + override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = { + val javaType = ctx.javaType(optionType) + val inputObject = child.gen(ctx) + + s""" + ${inputObject.code} + + boolean ${ev.isNull} = false; + scala.Option<$javaType> ${ev.value} = + ${inputObject.isNull} ? + scala.Option$$.MODULE$$.apply(null) : new scala.Some(${inputObject.value}); + """ + } +} + +/** + * A place holder for the loop variable used in [[MapObjects]]. This should never be constructed + * manually, but will instead be passed into the provided lambda function. + */ case class LambdaVariable(value: String, isNull: String, dataType: DataType) extends Expression { override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = @@ -251,7 +324,7 @@ case class LambdaVariable(value: String, isNull: String, dataType: DataType) ext * as an ArrayType. This is similar to a typical map operation, but where the lambda function * is expressed using catalyst expressions. * - * The following collection ObjectTypes are currently supported: Seq, Array + * The following collection ObjectTypes are currently supported: Seq, Array, ArrayData * * @param function A function that returns an expression, given an attribute that can be used * to access the current value. This is does as a lambda function so that @@ -265,14 +338,32 @@ case class MapObjects( inputData: Expression, elementType: DataType) extends Expression { - private val loopAttribute = AttributeReference("loopVar", elementType)() - private val completeFunction = function(loopAttribute) + private lazy val loopAttribute = AttributeReference("loopVar", elementType)() + private lazy val completeFunction = function(loopAttribute) - private val (lengthFunction, itemAccessor) = inputData.dataType match { - case ObjectType(cls) if cls.isAssignableFrom(classOf[Seq[_]]) => - (".size()", (i: String) => s".apply($i)") + private lazy val (lengthFunction, itemAccessor, primitiveElement) = inputData.dataType match { + case ObjectType(cls) if classOf[Seq[_]].isAssignableFrom(cls) => + (".size()", (i: String) => s".apply($i)", false) case ObjectType(cls) if cls.isArray => - (".length", (i: String) => s"[$i]") + (".length", (i: String) => s"[$i]", false) + case ArrayType(s: StructType, _) => + (".numElements()", (i: String) => s".getStruct($i, ${s.size})", false) + case ArrayType(a: ArrayType, _) => + (".numElements()", (i: String) => s".getArray($i)", true) + case ArrayType(IntegerType, _) => + (".numElements()", (i: String) => s".getInt($i)", true) + case ArrayType(LongType, _) => + (".numElements()", (i: String) => s".getLong($i)", true) + case ArrayType(FloatType, _) => + (".numElements()", (i: String) => s".getFloat($i)", true) + case ArrayType(DoubleType, _) => + (".numElements()", (i: String) => s".getDouble($i)", true) + case ArrayType(ByteType, _) => + (".numElements()", (i: String) => s".getByte($i)", true) + case ArrayType(ShortType, _) => + (".numElements()", (i: String) => s".getShort($i)", true) + case ArrayType(BooleanType, _) => + (".numElements()", (i: String) => s".getBoolean($i)", true) } override def 
nullable: Boolean = true @@ -294,15 +385,38 @@ case class MapObjects( val loopIsNull = ctx.freshName("loopIsNull") val loopVariable = LambdaVariable(loopValue, loopIsNull, elementType) - val boundFunction = completeFunction transform { + val substitutedFunction = completeFunction transform { case a: AttributeReference if a == loopAttribute => loopVariable } + // A hack to run this through the analyzer (to bind extractions). + val boundFunction = + SimpleAnalyzer.execute(Project(Alias(substitutedFunction, "")() :: Nil, LocalRelation(Nil))) + .expressions.head.children.head val genFunction = boundFunction.gen(ctx) val dataLength = ctx.freshName("dataLength") val convertedArray = ctx.freshName("convertedArray") val loopIndex = ctx.freshName("loopIndex") + val convertedType = ctx.javaType(boundFunction.dataType) + + // Because of the way Java defines nested arrays, we have to handle the syntax specially. + // Specifically, we have to insert the [$dataLength] in between the type and any extra nested + // array declarations (i.e. new String[1][]). + val arrayConstructor = if (convertedType contains "[]") { + val rawType = convertedType.takeWhile(_ != '[') + val arrayPart = convertedType.reverse.takeWhile(c => c == '[' || c == ']').reverse + s"new $rawType[$dataLength]$arrayPart" + } else { + s"new $convertedType[$dataLength]" + } + + val loopNullCheck = if (primitiveElement) { + s"boolean $loopIsNull = ${genInputData.value}.isNullAt($loopIndex);" + } else { + s"boolean $loopIsNull = ${genInputData.isNull} || $loopValue == null;" + } + s""" ${genInputData.code} @@ -310,19 +424,19 @@ case class MapObjects( $javaType ${ev.value} = ${ctx.defaultValue(dataType)}; if (!${ev.isNull}) { - Object[] $convertedArray = null; + $convertedType[] $convertedArray = null; int $dataLength = ${genInputData.value}$lengthFunction; - $convertedArray = new Object[$dataLength]; + $convertedArray = $arrayConstructor; int $loopIndex = 0; while ($loopIndex < $dataLength) { $elementJavaType $loopValue = ($elementJavaType)${genInputData.value}${itemAccessor(loopIndex)}; - boolean $loopIsNull = $loopValue == null; + $loopNullCheck ${genFunction.code} - $convertedArray[$loopIndex] = ${genFunction.value}; + $convertedArray[$loopIndex] = ($convertedType)${genFunction.value}; $loopIndex += 1; } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/ArrayBasedMapData.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/ArrayBasedMapData.scala index 52069598ee30e..5f22e59d5f1d8 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/ArrayBasedMapData.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/ArrayBasedMapData.scala @@ -62,4 +62,8 @@ object ArrayBasedMapData { val values = map.valueArray.asInstanceOf[GenericArrayData].array keys.zip(values).toMap } + + def toScalaMap(keys: Array[Any], values: Array[Any]): Map[Any, Any] = { + keys.zip(values).toMap + } } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/ArrayData.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/ArrayData.scala index 642c56f12ded1..b4ea300f5f306 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/ArrayData.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/ArrayData.scala @@ -26,6 +26,8 @@ abstract class ArrayData extends SpecializedGetters with Serializable { def copy(): ArrayData + def array: Array[Any] + def toBooleanArray(): Array[Boolean] = { val size = numElements() val values = new Array[Boolean](size) @@ -103,6 +105,9 @@ abstract class 
ArrayData extends SpecializedGetters with Serializable { values } + def toObjectArray(elementType: DataType): Array[AnyRef] = + toArray[AnyRef](elementType: DataType) + def toArray[T: ClassTag](elementType: DataType): Array[T] = { val size = numElements() val values = new Array[T](size) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/GenericArrayData.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/GenericArrayData.scala index c3816033275d5..9448d88d6c5f0 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/GenericArrayData.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/GenericArrayData.scala @@ -20,7 +20,7 @@ package org.apache.spark.sql.types import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.unsafe.types.{CalendarInterval, UTF8String} -class GenericArrayData(private[sql] val array: Array[Any]) extends ArrayData { +class GenericArrayData(val array: Array[Any]) extends ArrayData { def this(seq: scala.collection.GenIterable[Any]) = this(seq.toArray) @@ -29,6 +29,8 @@ class GenericArrayData(private[sql] val array: Array[Any]) extends ArrayData { def this(primitiveArray: Array[Long]) = this(primitiveArray.toSeq) def this(primitiveArray: Array[Float]) = this(primitiveArray.toSeq) def this(primitiveArray: Array[Double]) = this(primitiveArray.toSeq) + def this(primitiveArray: Array[Short]) = this(primitiveArray.toSeq) + def this(primitiveArray: Array[Byte]) = this(primitiveArray.toSeq) def this(primitiveArray: Array[Boolean]) = this(primitiveArray.toSeq) override def copy(): ArrayData = new GenericArrayData(array.clone()) diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/encoders/ProductEncoderSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/encoders/ProductEncoderSuite.scala index 99c993d3febc2..02e43ddb35478 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/encoders/ProductEncoderSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/encoders/ProductEncoderSuite.scala @@ -17,158 +17,263 @@ package org.apache.spark.sql.catalyst.encoders -import java.sql.{Date, Timestamp} +import java.util + +import org.apache.spark.sql.types.{StructField, ArrayType, ArrayData} + +import scala.collection.mutable.ArrayBuffer +import scala.reflect.runtime.universe._ import org.apache.spark.SparkFunSuite -import org.apache.spark.sql.catalyst.ScalaReflection._ -import org.apache.spark.sql.catalyst.expressions.UnsafeProjection import org.apache.spark.sql.catalyst._ - case class RepeatedStruct(s: Seq[PrimitiveData]) case class NestedArray(a: Array[Array[Int]]) -class ProductEncoderSuite extends SparkFunSuite { +case class BoxedData( + intField: java.lang.Integer, + longField: java.lang.Long, + doubleField: java.lang.Double, + floatField: java.lang.Float, + shortField: java.lang.Short, + byteField: java.lang.Byte, + booleanField: java.lang.Boolean) - test("convert PrimitiveData to InternalRow") { - val inputData = PrimitiveData(1, 1, 1, 1, 1, 1, true) - val encoder = ProductEncoder[PrimitiveData] - val convertedData = encoder.toRow(inputData) - - assert(convertedData.getInt(0) == 1) - assert(convertedData.getLong(1) == 1.toLong) - assert(convertedData.getDouble(2) == 1.toDouble) - assert(convertedData.getFloat(3) == 1.toFloat) - assert(convertedData.getShort(4) == 1.toShort) - assert(convertedData.getByte(5) == 1.toByte) - assert(convertedData.getBoolean(6) == true) - } +case class RepeatedData( + arrayField: Seq[Int], + arrayFieldContainsNull: 
Seq[java.lang.Integer], + mapField: scala.collection.Map[Int, Long], + mapFieldNull: scala.collection.Map[Int, java.lang.Long], + structField: PrimitiveData) - test("convert Some[_] to InternalRow") { - val primitiveData = PrimitiveData(1, 1, 1, 1, 1, 1, true) - val inputData = OptionalData(Some(2), Some(2), Some(2), Some(2), Some(2), Some(2), Some(true), - Some(primitiveData)) - - val encoder = ProductEncoder[OptionalData] - val convertedData = encoder.toRow(inputData) - - assert(convertedData.getInt(0) == 2) - assert(convertedData.getLong(1) == 2.toLong) - assert(convertedData.getDouble(2) == 2.toDouble) - assert(convertedData.getFloat(3) == 2.toFloat) - assert(convertedData.getShort(4) == 2.toShort) - assert(convertedData.getByte(5) == 2.toByte) - assert(convertedData.getBoolean(6) == true) - - val nestedRow = convertedData.getStruct(7, 7) - assert(nestedRow.getInt(0) == 1) - assert(nestedRow.getLong(1) == 1.toLong) - assert(nestedRow.getDouble(2) == 1.toDouble) - assert(nestedRow.getFloat(3) == 1.toFloat) - assert(nestedRow.getShort(4) == 1.toShort) - assert(nestedRow.getByte(5) == 1.toByte) - assert(nestedRow.getBoolean(6) == true) - } +case class SpecificCollection(l: List[Int]) - test("convert None to InternalRow") { - val inputData = OptionalData(None, None, None, None, None, None, None, None) - val encoder = ProductEncoder[OptionalData] - val convertedData = encoder.toRow(inputData) - - assert(convertedData.isNullAt(0)) - assert(convertedData.isNullAt(1)) - assert(convertedData.isNullAt(2)) - assert(convertedData.isNullAt(3)) - assert(convertedData.isNullAt(4)) - assert(convertedData.isNullAt(5)) - assert(convertedData.isNullAt(6)) - assert(convertedData.isNullAt(7)) - } +class ProductEncoderSuite extends SparkFunSuite { - test("convert nullable but present data to InternalRow") { - val inputData = NullableData( - 1, 1L, 1.0, 1.0f, 1.toShort, 1.toByte, true, "test", new java.math.BigDecimal(1), new Date(0), - new Timestamp(0), Array[Byte](1, 2, 3)) - - val encoder = ProductEncoder[NullableData] - val convertedData = encoder.toRow(inputData) - - assert(convertedData.getInt(0) == 1) - assert(convertedData.getLong(1) == 1.toLong) - assert(convertedData.getDouble(2) == 1.toDouble) - assert(convertedData.getFloat(3) == 1.toFloat) - assert(convertedData.getShort(4) == 1.toShort) - assert(convertedData.getByte(5) == 1.toByte) - assert(convertedData.getBoolean(6) == true) - } + encodeDecodeTest(PrimitiveData(1, 1, 1, 1, 1, 1, true)) - test("convert nullable data to InternalRow") { - val inputData = - NullableData(null, null, null, null, null, null, null, null, null, null, null, null) - - val encoder = ProductEncoder[NullableData] - val convertedData = encoder.toRow(inputData) - - assert(convertedData.isNullAt(0)) - assert(convertedData.isNullAt(1)) - assert(convertedData.isNullAt(2)) - assert(convertedData.isNullAt(3)) - assert(convertedData.isNullAt(4)) - assert(convertedData.isNullAt(5)) - assert(convertedData.isNullAt(6)) - assert(convertedData.isNullAt(7)) - assert(convertedData.isNullAt(8)) - assert(convertedData.isNullAt(9)) - assert(convertedData.isNullAt(10)) - assert(convertedData.isNullAt(11)) - } + // TODO: Support creating specific subclasses of Seq. 
+ ignore("Specific collection types") { encodeDecodeTest(SpecificCollection(1 :: Nil)) } - test("convert repeated struct") { - val inputData = RepeatedStruct(PrimitiveData(1, 1, 1, 1, 1, 1, true) :: Nil) - val encoder = ProductEncoder[RepeatedStruct] - - val converted = encoder.toRow(inputData) - val convertedStruct = converted.getArray(0).getStruct(0, 7) - assert(convertedStruct.getInt(0) == 1) - assert(convertedStruct.getLong(1) == 1.toLong) - assert(convertedStruct.getDouble(2) == 1.toDouble) - assert(convertedStruct.getFloat(3) == 1.toFloat) - assert(convertedStruct.getShort(4) == 1.toShort) - assert(convertedStruct.getByte(5) == 1.toByte) - assert(convertedStruct.getBoolean(6) == true) - } + encodeDecodeTest( + OptionalData( + Some(2), Some(2), Some(2), Some(2), Some(2), Some(2), Some(true), + Some(PrimitiveData(1, 1, 1, 1, 1, 1, true)))) - test("convert nested seq") { - val convertedData = ProductEncoder[Tuple1[Seq[Seq[Int]]]].toRow(Tuple1(Seq(Seq(1)))) - assert(convertedData.getArray(0).getArray(0).getInt(0) == 1) + encodeDecodeTest(OptionalData(None, None, None, None, None, None, None, None)) - val convertedData2 = ProductEncoder[Tuple1[Seq[Seq[Seq[Int]]]]].toRow(Tuple1(Seq(Seq(Seq(1))))) - assert(convertedData2.getArray(0).getArray(0).getArray(0).getInt(0) == 1) - } + encodeDecodeTest( + BoxedData(1, 1L, 1.0, 1.0f, 1.toShort, 1.toByte, true)) - test("convert nested array") { - val convertedData = ProductEncoder[Tuple1[Array[Array[Int]]]].toRow(Tuple1(Array(Array(1)))) - } + encodeDecodeTest( + BoxedData(null, null, null, null, null, null, null)) + + encodeDecodeTest( + RepeatedStruct(PrimitiveData(1, 1, 1, 1, 1, 1, true) :: Nil)) - test("convert complex") { - val inputData = ComplexData( + encodeDecodeTest( + RepeatedData( Seq(1, 2), - Array(1, 2), - 1 :: 2 :: Nil, Seq(new Integer(1), null, new Integer(2)), Map(1 -> 2L), - Map(1 -> new java.lang.Long(2)), - PrimitiveData(1, 1, 1, 1, 1, 1, true), - Array(Array(1))) - - val encoder = ProductEncoder[ComplexData] - val convertedData = encoder.toRow(inputData) - - assert(!convertedData.isNullAt(0)) - val seq = convertedData.getArray(0) - assert(seq.numElements() == 2) - assert(seq.getInt(0) == 1) - assert(seq.getInt(1) == 2) + Map(1 -> null), + PrimitiveData(1, 1, 1, 1, 1, 1, true))) + + encodeDecodeTest(("nullable Seq[Integer]", Seq[Integer](1, null))) + + encodeDecodeTest(("Seq[(String, String)]", + Seq(("a", "b")))) + encodeDecodeTest(("Seq[(Int, Int)]", + Seq((1, 2)))) + encodeDecodeTest(("Seq[(Long, Long)]", + Seq((1L, 2L)))) + encodeDecodeTest(("Seq[(Float, Float)]", + Seq((1.toFloat, 2.toFloat)))) + encodeDecodeTest(("Seq[(Double, Double)]", + Seq((1.toDouble, 2.toDouble)))) + encodeDecodeTest(("Seq[(Short, Short)]", + Seq((1.toShort, 2.toShort)))) + encodeDecodeTest(("Seq[(Byte, Byte)]", + Seq((1.toByte, 2.toByte)))) + encodeDecodeTest(("Seq[(Boolean, Boolean)]", + Seq((true, false)))) + + // TODO: Decoding/encoding of complex maps. 
+ ignore("complex maps") { + encodeDecodeTest(("Map[Int, (String, String)]", + Map(1 ->("a", "b")))) + } + + encodeDecodeTest(("ArrayBuffer[(String, String)]", + ArrayBuffer(("a", "b")))) + encodeDecodeTest(("ArrayBuffer[(Int, Int)]", + ArrayBuffer((1, 2)))) + encodeDecodeTest(("ArrayBuffer[(Long, Long)]", + ArrayBuffer((1L, 2L)))) + encodeDecodeTest(("ArrayBuffer[(Float, Float)]", + ArrayBuffer((1.toFloat, 2.toFloat)))) + encodeDecodeTest(("ArrayBuffer[(Double, Double)]", + ArrayBuffer((1.toDouble, 2.toDouble)))) + encodeDecodeTest(("ArrayBuffer[(Short, Short)]", + ArrayBuffer((1.toShort, 2.toShort)))) + encodeDecodeTest(("ArrayBuffer[(Byte, Byte)]", + ArrayBuffer((1.toByte, 2.toByte)))) + encodeDecodeTest(("ArrayBuffer[(Boolean, Boolean)]", + ArrayBuffer((true, false)))) + + encodeDecodeTest(("Seq[Seq[(Int, Int)]]", + Seq(Seq((1, 2))))) + + encodeDecodeTestCustom(("Array[Array[(Int, Int)]]", + Array(Array((1, 2))))) + { (l, r) => l._2(0)(0) == r._2(0)(0) } + + encodeDecodeTestCustom(("Array[Array[(Int, Int)]]", + Array(Array(Array((1, 2)))))) + { (l, r) => l._2(0)(0)(0) == r._2(0)(0)(0) } + + encodeDecodeTestCustom(("Array[Array[Array[(Int, Int)]]]", + Array(Array(Array(Array((1, 2))))))) + { (l, r) => l._2(0)(0)(0)(0) == r._2(0)(0)(0)(0) } + + encodeDecodeTestCustom(("Array[Array[Array[Array[(Int, Int)]]]]", + Array(Array(Array(Array(Array((1, 2)))))))) + { (l, r) => l._2(0)(0)(0)(0)(0) == r._2(0)(0)(0)(0)(0) } + + + encodeDecodeTestCustom(("Array[Array[Integer]]", + Array(Array[Integer](1)))) + { (l, r) => l._2(0)(0) == r._2(0)(0) } + + encodeDecodeTestCustom(("Array[Array[Int]]", + Array(Array(1)))) + { (l, r) => l._2(0)(0) == r._2(0)(0) } + + encodeDecodeTestCustom(("Array[Array[Int]]", + Array(Array(Array(1))))) + { (l, r) => l._2(0)(0)(0) == r._2(0)(0)(0) } + + encodeDecodeTestCustom(("Array[Array[Array[Int]]]", + Array(Array(Array(Array(1)))))) + { (l, r) => l._2(0)(0)(0)(0) == r._2(0)(0)(0)(0) } + + encodeDecodeTestCustom(("Array[Array[Array[Array[Int]]]]", + Array(Array(Array(Array(Array(1))))))) + { (l, r) => l._2(0)(0)(0)(0)(0) == r._2(0)(0)(0)(0)(0) } + + encodeDecodeTest(("Array[Byte] null", + null: Array[Byte])) + encodeDecodeTestCustom(("Array[Byte]", + Array[Byte](1, 2, 3))) + { (l, r) => util.Arrays.equals(l._2, r._2) } + + encodeDecodeTest(("Array[Int] null", + null: Array[Int])) + encodeDecodeTestCustom(("Array[Int]", + Array[Int](1, 2, 3))) + { (l, r) => util.Arrays.equals(l._2, r._2) } + + encodeDecodeTest(("Array[Long] null", + null: Array[Long])) + encodeDecodeTestCustom(("Array[Long]", + Array[Long](1, 2, 3))) + { (l, r) => util.Arrays.equals(l._2, r._2) } + + encodeDecodeTest(("Array[Double] null", + null: Array[Double])) + encodeDecodeTestCustom(("Array[Double]", + Array[Double](1, 2, 3))) + { (l, r) => util.Arrays.equals(l._2, r._2) } + + encodeDecodeTest(("Array[Float] null", + null: Array[Float])) + encodeDecodeTestCustom(("Array[Float]", + Array[Float](1, 2, 3))) + { (l, r) => util.Arrays.equals(l._2, r._2) } + + encodeDecodeTest(("Array[Boolean] null", + null: Array[Boolean])) + encodeDecodeTestCustom(("Array[Boolean]", + Array[Boolean](true, false))) + { (l, r) => util.Arrays.equals(l._2, r._2) } + + encodeDecodeTest(("Array[Short] null", + null: Array[Short])) + encodeDecodeTestCustom(("Array[Short]", + Array[Short](1, 2, 3))) + { (l, r) => util.Arrays.equals(l._2, r._2) } + + encodeDecodeTestCustom(("java.sql.Timestamp", + new java.sql.Timestamp(1))) + { (l, r) => l._2.toString == r._2.toString } + + encodeDecodeTestCustom(("java.sql.Date", new 
java.sql.Date(1))) + { (l, r) => l._2.toString == r._2.toString } + + /** Simplified encodeDecodeTestCustom, where the comparison function can be `Object.equals`. */ + protected def encodeDecodeTest[T <: Product : TypeTag](inputData: T) = + encodeDecodeTestCustom[T](inputData)((l, r) => l == r) + + /** + * Constructs a test that round-trips `t` through an encoder, checking the results to ensure it + * matches the original. + */ + protected def encodeDecodeTestCustom[T <: Product : TypeTag]( + inputData: T)( + c: (T, T) => Boolean) = { + test(s"encode/decode: $inputData") { + val encoder = try ProductEncoder[T] catch { + case e: Exception => + fail(s"Exception thrown generating encoder", e) + } + val convertedData = encoder.toRow(inputData) + val schema = encoder.schema.toAttributes + val boundEncoder = encoder.bind(schema) + val convertedBack = try boundEncoder.fromRow(convertedData) catch { + case e: Exception => + fail( + s"""Exception thrown while decoding + |Converted: $convertedData + |Schema: ${schema.mkString(",")} + |${encoder.schema.treeString} + | + |Construct Expressions: + |${boundEncoder.constructExpression.treeString} + | + """.stripMargin, e) + } + + if (!c(inputData, convertedBack)) { + val types = + convertedBack.productIterator.filter(_ != null).map(_.getClass.getName).mkString(",") + + val encodedData = convertedData.toSeq(encoder.schema).zip(encoder.schema).map { + case (a: ArrayData, StructField(_, at: ArrayType, _, _)) => + a.toArray[Any](at.elementType).toSeq + case (other, _) => + other + }.mkString("[", ",", "]") + + fail( + s"""Encoded/Decoded data does not match input data + | + |in: $inputData + |out: $convertedBack + |types: $types + | + |Encoded Data: $encodedData + |Schema: ${schema.mkString(",")} + |${encoder.schema.treeString} + | + |Extract Expressions: + |${boundEncoder.extractExpressions.map(_.treeString).mkString("\n")} + | + |Construct Expressions: + |${boundEncoder.constructExpression.treeString} + | + """.stripMargin) + } + } } } From e170c22160bb452f98c340489ebf8390116a8cbb Mon Sep 17 00:00:00 2001 From: Wenchen Fan Date: Tue, 13 Oct 2015 17:11:22 -0700 Subject: [PATCH 045/139] [SPARK-11032] [SQL] correctly handle having We should not stop resolving having when the having condtion is resolved, or something like `count(1)` will crash. Author: Wenchen Fan Closes #9105 from cloud-fan/having. 
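For reference, a minimal sketch (assuming a running SparkContext `sc`) of the query shape this fixes; it mirrors the regression test added below.

    val sqlContext = new org.apache.spark.sql.SQLContext(sc)
    import sqlContext.implicits._

    // The HAVING condition references an aggregate (COUNT(1)) that is not in the SELECT
    // list. Before this change, analysis stopped rewriting the Filter once its condition
    // was resolved, so the extra aggregate was never pushed into the Aggregate below it
    // and the query failed.
    Seq(1 -> "a").toDF("i", "j").registerTempTable("src")
    sqlContext.sql(
      "SELECT MIN(t.i) FROM (SELECT * FROM src WHERE i > 0) t HAVING (COUNT(1) > 0)").show()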
--- .../apache/spark/sql/catalyst/analysis/Analyzer.scala | 2 +- .../test/scala/org/apache/spark/sql/SQLQuerySuite.scala | 9 +++++++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala index f5597a08d3595..041ab22827399 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala @@ -553,7 +553,7 @@ class Analyzer( def apply(plan: LogicalPlan): LogicalPlan = plan resolveOperators { case filter @ Filter(havingCondition, aggregate @ Aggregate(grouping, originalAggExprs, child)) - if aggregate.resolved && !filter.resolved => + if aggregate.resolved => // Try resolving the condition of the filter as though it is in the aggregate clause val aggregatedCondition = diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala index eca6f1073889a..636591630e136 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala @@ -1809,4 +1809,13 @@ class SQLQuerySuite extends QueryTest with SharedSQLContext { df1.withColumn("diff", lit(0))) } } + + test("SPARK-11032: resolve having correctly") { + withTempTable("src") { + Seq(1 -> "a").toDF("i", "j").registerTempTable("src") + checkAnswer( + sql("SELECT MIN(t.i) FROM (SELECT * FROM src WHERE i > 0) t HAVING(COUNT(1) > 0)"), + Row(1)) + } + } } From 15ff85b3163acbe8052d4489a00bcf1d2332fcf0 Mon Sep 17 00:00:00 2001 From: Wenchen Fan Date: Tue, 13 Oct 2015 17:59:32 -0700 Subject: [PATCH 046/139] [SPARK-11068] [SQL] add callback to query execution With this feature, we can track the query plan, time cost, exception during query execution for spark users. Author: Wenchen Fan Closes #9078 from cloud-fan/callback. --- .../org/apache/spark/sql/DataFrame.scala | 46 +++++- .../spark/sql/QueryExecutionListener.scala | 136 ++++++++++++++++++ .../org/apache/spark/sql/SQLContext.scala | 3 + .../spark/sql/DataFrameCallbackSuite.scala | 82 +++++++++++ 4 files changed, 261 insertions(+), 6 deletions(-) create mode 100644 sql/core/src/main/scala/org/apache/spark/sql/QueryExecutionListener.scala create mode 100644 sql/core/src/test/scala/org/apache/spark/sql/DataFrameCallbackSuite.scala diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala index 01f60aba87ede..bfe8d3c8ef957 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala @@ -1344,7 +1344,9 @@ class DataFrame private[sql]( * @group action * @since 1.3.0 */ - def head(n: Int): Array[Row] = limit(n).collect() + def head(n: Int): Array[Row] = withCallback("head", limit(n)) { df => + df.collect(needCallback = false) + } /** * Returns the first row. 
@@ -1414,8 +1416,18 @@ class DataFrame private[sql]( * @group action * @since 1.3.0 */ - def collect(): Array[Row] = withNewExecutionId { - queryExecution.executedPlan.executeCollectPublic() + def collect(): Array[Row] = collect(needCallback = true) + + private def collect(needCallback: Boolean): Array[Row] = { + def execute(): Array[Row] = withNewExecutionId { + queryExecution.executedPlan.executeCollectPublic() + } + + if (needCallback) { + withCallback("collect", this)(_ => execute()) + } else { + execute() + } } /** @@ -1423,8 +1435,10 @@ class DataFrame private[sql]( * @group action * @since 1.3.0 */ - def collectAsList(): java.util.List[Row] = withNewExecutionId { - java.util.Arrays.asList(rdd.collect() : _*) + def collectAsList(): java.util.List[Row] = withCallback("collectAsList", this) { _ => + withNewExecutionId { + java.util.Arrays.asList(rdd.collect() : _*) + } } /** @@ -1432,7 +1446,9 @@ class DataFrame private[sql]( * @group action * @since 1.3.0 */ - def count(): Long = groupBy().count().collect().head.getLong(0) + def count(): Long = withCallback("count", groupBy().count()) { df => + df.collect(needCallback = false).head.getLong(0) + } /** * Returns a new [[DataFrame]] that has exactly `numPartitions` partitions. @@ -1936,6 +1952,24 @@ class DataFrame private[sql]( SQLExecution.withNewExecutionId(sqlContext, queryExecution)(body) } + /** + * Wrap a DataFrame action to track the QueryExecution and time cost, then report to the + * user-registered callback functions. + */ + private def withCallback[T](name: String, df: DataFrame)(action: DataFrame => T) = { + try { + val start = System.nanoTime() + val result = action(df) + val end = System.nanoTime() + sqlContext.listenerManager.onSuccess(name, df.queryExecution, end - start) + result + } catch { + case e: Exception => + sqlContext.listenerManager.onFailure(name, df.queryExecution, e) + throw e + } + } + //////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////// // End of deprecated methods diff --git a/sql/core/src/main/scala/org/apache/spark/sql/QueryExecutionListener.scala b/sql/core/src/main/scala/org/apache/spark/sql/QueryExecutionListener.scala new file mode 100644 index 0000000000000..14fbebb45f8b7 --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/QueryExecutionListener.scala @@ -0,0 +1,136 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql + +import java.util.concurrent.locks.ReentrantReadWriteLock +import scala.collection.mutable.ListBuffer + +import org.apache.spark.annotation.{DeveloperApi, Experimental} +import org.apache.spark.Logging +import org.apache.spark.sql.execution.QueryExecution + + +/** + * The interface of query execution listener that can be used to analyze execution metrics. + * + * Note that implementations should guarantee thread-safety as they will be used in a non + * thread-safe way. + */ +@Experimental +trait QueryExecutionListener { + + /** + * A callback function that will be called when a query executed successfully. + * Implementations should guarantee thread-safe. + * + * @param funcName the name of the action that triggered this query. + * @param qe the QueryExecution object that carries detail information like logical plan, + * physical plan, etc. + * @param duration the execution time for this query in nanoseconds. + */ + @DeveloperApi + def onSuccess(funcName: String, qe: QueryExecution, duration: Long) + + /** + * A callback function that will be called when a query execution failed. + * Implementations should guarantee thread-safe. + * + * @param funcName the name of the action that triggered this query. + * @param qe the QueryExecution object that carries detail information like logical plan, + * physical plan, etc. + * @param exception the exception that failed this query. + */ + @DeveloperApi + def onFailure(funcName: String, qe: QueryExecution, exception: Exception) +} + +@Experimental +class ExecutionListenerManager extends Logging { + private[this] val listeners = ListBuffer.empty[QueryExecutionListener] + private[this] val lock = new ReentrantReadWriteLock() + + /** Acquires a read lock on the cache for the duration of `f`. */ + private def readLock[A](f: => A): A = { + val rl = lock.readLock() + rl.lock() + try f finally { + rl.unlock() + } + } + + /** Acquires a write lock on the cache for the duration of `f`. */ + private def writeLock[A](f: => A): A = { + val wl = lock.writeLock() + wl.lock() + try f finally { + wl.unlock() + } + } + + /** + * Registers the specified QueryExecutionListener. + */ + @DeveloperApi + def register(listener: QueryExecutionListener): Unit = writeLock { + listeners += listener + } + + /** + * Unregisters the specified QueryExecutionListener. + */ + @DeveloperApi + def unregister(listener: QueryExecutionListener): Unit = writeLock { + listeners -= listener + } + + /** + * clears out all registered QueryExecutionListeners. 
+ */ + @DeveloperApi + def clear(): Unit = writeLock { + listeners.clear() + } + + private[sql] def onSuccess( + funcName: String, + qe: QueryExecution, + duration: Long): Unit = readLock { + withErrorHandling { listener => + listener.onSuccess(funcName, qe, duration) + } + } + + private[sql] def onFailure( + funcName: String, + qe: QueryExecution, + exception: Exception): Unit = readLock { + withErrorHandling { listener => + listener.onFailure(funcName, qe, exception) + } + } + + private def withErrorHandling(f: QueryExecutionListener => Unit): Unit = { + for (listener <- listeners) { + try { + f(listener) + } catch { + case e: Exception => logWarning("error executing query execution listener", e) + } + } + } +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala index cd937257d31a8..a835408f8af3a 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala @@ -177,6 +177,9 @@ class SQLContext private[sql]( */ def getAllConfs: immutable.Map[String, String] = conf.getAllConfs + @transient + lazy val listenerManager: ExecutionListenerManager = new ExecutionListenerManager + @transient protected[sql] lazy val catalog: Catalog = new SimpleCatalog(conf) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameCallbackSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameCallbackSuite.scala new file mode 100644 index 0000000000000..4e286a0076205 --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameCallbackSuite.scala @@ -0,0 +1,82 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql + +import org.apache.spark.SparkException +import org.apache.spark.sql.catalyst.plans.logical.{Aggregate, Project} +import org.apache.spark.sql.execution.QueryExecution +import org.apache.spark.sql.test.SharedSQLContext + +import scala.collection.mutable.ArrayBuffer + +class DataFrameCallbackSuite extends QueryTest with SharedSQLContext { + import testImplicits._ + import functions._ + + test("execute callback functions when a DataFrame action finished successfully") { + val metrics = ArrayBuffer.empty[(String, QueryExecution, Long)] + val listener = new QueryExecutionListener { + // Only test successful case here, so no need to implement `onFailure` + override def onFailure(funcName: String, qe: QueryExecution, exception: Exception): Unit = {} + + override def onSuccess(funcName: String, qe: QueryExecution, duration: Long): Unit = { + metrics += ((funcName, qe, duration)) + } + } + sqlContext.listenerManager.register(listener) + + val df = Seq(1 -> "a").toDF("i", "j") + df.select("i").collect() + df.filter($"i" > 0).count() + + assert(metrics.length == 2) + + assert(metrics(0)._1 == "collect") + assert(metrics(0)._2.analyzed.isInstanceOf[Project]) + assert(metrics(0)._3 > 0) + + assert(metrics(1)._1 == "count") + assert(metrics(1)._2.analyzed.isInstanceOf[Aggregate]) + assert(metrics(1)._3 > 0) + } + + test("execute callback functions when a DataFrame action failed") { + val metrics = ArrayBuffer.empty[(String, QueryExecution, Exception)] + val listener = new QueryExecutionListener { + override def onFailure(funcName: String, qe: QueryExecution, exception: Exception): Unit = { + metrics += ((funcName, qe, exception)) + } + + // Only test failed case here, so no need to implement `onSuccess` + override def onSuccess(funcName: String, qe: QueryExecution, duration: Long): Unit = {} + } + sqlContext.listenerManager.register(listener) + + val errorUdf = udf[Int, Int] { _ => throw new RuntimeException("udf error") } + val df = sparkContext.makeRDD(Seq(1 -> "a")).toDF("i", "j") + + // Ignore the log when we are expecting an exception. + sparkContext.setLogLevel("FATAL") + val e = intercept[SparkException](df.select(errorUdf($"i")).collect()) + + assert(metrics.length == 1) + assert(metrics(0)._1 == "collect") + assert(metrics(0)._2.analyzed.isInstanceOf[Project]) + assert(metrics(0)._3.getMessage == e.getMessage) + } +} From ce3f9a80657751ee0bc0ed6a9b6558acbb40af4f Mon Sep 17 00:00:00 2001 From: Yin Huai Date: Tue, 13 Oct 2015 18:21:24 -0700 Subject: [PATCH 047/139] [SPARK-11091] [SQL] Change spark.sql.canonicalizeView to spark.sql.nativeView. https://issues.apache.org/jira/browse/SPARK-11091 Author: Yin Huai Closes #9103 from yhuai/SPARK-11091. 
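The rename only changes the configuration key; the behaviour stays the same. A minimal sketch of toggling the renamed flag (assuming a HiveContext bound to `sqlContext`, as in the tests below):

    // Enable Spark SQL's native CREATE VIEW handling under its new name
    // (formerly spark.sql.canonicalizeView).
    sqlContext.setConf("spark.sql.nativeView", "true")

    // With the flag on, a non-Hive-compatible table (e.g. a JSON table written by
    // Spark SQL) can back a Hive view.
    sqlContext.range(1, 10).write.format("json").saveAsTable("jt")
    sqlContext.sql("CREATE VIEW testView AS SELECT id FROM jt")
    sqlContext.sql("SELECT * FROM testView").show()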
--- .../main/scala/org/apache/spark/sql/SQLConf.scala | 4 ++-- .../spark/sql/hive/HiveMetastoreCatalog.scala | 2 +- .../scala/org/apache/spark/sql/hive/HiveQl.scala | 2 +- .../spark/sql/hive/execution/SQLQuerySuite.scala | 14 +++++++------- 4 files changed, 11 insertions(+), 11 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala b/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala index f62df9bdebcc0..b08cc8e830737 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/SQLConf.scala @@ -328,7 +328,7 @@ private[spark] object SQLConf { doc = "When true, some predicates will be pushed down into the Hive metastore so that " + "unmatching partitions can be eliminated earlier.") - val CANONICALIZE_VIEW = booleanConf("spark.sql.canonicalizeView", + val NATIVE_VIEW = booleanConf("spark.sql.nativeView", defaultValue = Some(false), doc = "When true, CREATE VIEW will be handled by Spark SQL instead of Hive native commands. " + "Note that this function is experimental and should ony be used when you are using " + @@ -489,7 +489,7 @@ private[sql] class SQLConf extends Serializable with CatalystConf { private[spark] def metastorePartitionPruning: Boolean = getConf(HIVE_METASTORE_PARTITION_PRUNING) - private[spark] def canonicalizeView: Boolean = getConf(CANONICALIZE_VIEW) + private[spark] def nativeView: Boolean = getConf(NATIVE_VIEW) private[spark] def sortMergeJoinEnabled: Boolean = getConf(SORTMERGE_JOIN) diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala index cf59bc0d590b0..1f8223e1ff507 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala @@ -591,7 +591,7 @@ private[hive] class HiveMetastoreCatalog(val client: ClientInterface, hive: Hive case p: LogicalPlan if p.resolved => p case CreateViewAsSelect(table, child, allowExisting, replace, sql) => - if (conf.canonicalizeView) { + if (conf.nativeView) { if (allowExisting && replace) { throw new AnalysisException( "It is not allowed to define a view with both IF NOT EXISTS and OR REPLACE.") diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala index 250c232856885..1d505019400bc 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala @@ -537,7 +537,7 @@ https://cwiki.apache.org/confluence/display/Hive/Enhanced+Aggregation%2C+Cube%2C serde = None, viewText = Some(originalText)) - // We need to keep the original SQL string so that if `spark.sql.canonicalizeView` is + // We need to keep the original SQL string so that if `spark.sql.nativeView` is // false, we can fall back to use hive native command later. // We can remove this when parser is configurable(can access SQLConf) in the future. 
val sql = context.getTokenRewriteStream diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala index 51b63f3688783..6aa34605b05a8 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala @@ -1282,7 +1282,7 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton { } test("correctly parse CREATE VIEW statement") { - withSQLConf(SQLConf.CANONICALIZE_VIEW.key -> "true") { + withSQLConf(SQLConf.NATIVE_VIEW.key -> "true") { withTable("jt") { val df = (1 until 10).map(i => i -> i).toDF("i", "j") df.write.format("json").saveAsTable("jt") @@ -1299,7 +1299,7 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton { } test("correctly handle CREATE VIEW IF NOT EXISTS") { - withSQLConf(SQLConf.CANONICALIZE_VIEW.key -> "true") { + withSQLConf(SQLConf.NATIVE_VIEW.key -> "true") { withTable("jt", "jt2") { sqlContext.range(1, 10).write.format("json").saveAsTable("jt") sql("CREATE VIEW testView AS SELECT id FROM jt") @@ -1316,7 +1316,7 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton { } test("correctly handle CREATE OR REPLACE VIEW") { - withSQLConf(SQLConf.CANONICALIZE_VIEW.key -> "true") { + withSQLConf(SQLConf.NATIVE_VIEW.key -> "true") { withTable("jt", "jt2") { sqlContext.range(1, 10).write.format("json").saveAsTable("jt") sql("CREATE OR REPLACE VIEW testView AS SELECT id FROM jt") @@ -1339,7 +1339,7 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton { } test("correctly handle ALTER VIEW") { - withSQLConf(SQLConf.CANONICALIZE_VIEW.key -> "true") { + withSQLConf(SQLConf.NATIVE_VIEW.key -> "true") { withTable("jt", "jt2") { sqlContext.range(1, 10).write.format("json").saveAsTable("jt") sql("CREATE VIEW testView AS SELECT id FROM jt") @@ -1357,7 +1357,7 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton { test("create hive view for json table") { // json table is not hive-compatible, make sure the new flag fix it. - withSQLConf(SQLConf.CANONICALIZE_VIEW.key -> "true") { + withSQLConf(SQLConf.NATIVE_VIEW.key -> "true") { withTable("jt") { sqlContext.range(1, 10).write.format("json").saveAsTable("jt") sql("CREATE VIEW testView AS SELECT id FROM jt") @@ -1369,7 +1369,7 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton { test("create hive view for partitioned parquet table") { // partitioned parquet table is not hive-compatible, make sure the new flag fix it. - withSQLConf(SQLConf.CANONICALIZE_VIEW.key -> "true") { + withSQLConf(SQLConf.NATIVE_VIEW.key -> "true") { withTable("parTable") { val df = Seq(1 -> "a").toDF("i", "j") df.write.format("parquet").partitionBy("i").saveAsTable("parTable") @@ -1382,7 +1382,7 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton { test("create hive view for joined tables") { // make sure the new flag can handle some complex cases like join and schema change. 
- withSQLConf(SQLConf.CANONICALIZE_VIEW.key -> "true") { + withSQLConf(SQLConf.NATIVE_VIEW.key -> "true") { withTable("jt1", "jt2") { sqlContext.range(1, 10).toDF("id1").write.format("json").saveAsTable("jt1") sqlContext.range(1, 10).toDF("id2").write.format("json").saveAsTable("jt2") From 8b32885704502ab2a715cf5142d7517181074428 Mon Sep 17 00:00:00 2001 From: Monica Liu Date: Tue, 13 Oct 2015 22:24:52 -0700 Subject: [PATCH 048/139] [SPARK-10981] [SPARKR] SparkR Join improvements I was having issues with collect() and orderBy() in Spark 1.5.0 so I used the DataFrame.R file and test_sparkSQL.R file from the Spark 1.5.1 download. I only modified the join() function in DataFrame.R to include "full", "fullouter", "left", "right", and "leftsemi" and added corresponding test cases in the test for join() and merge() in test_sparkSQL.R file. Pull request because I filed this JIRA bug report: https://issues.apache.org/jira/browse/SPARK-10981 Author: Monica Liu Closes #9029 from mfliu/master. --- R/pkg/R/DataFrame.R | 13 +++++++++---- R/pkg/inst/tests/test_sparkSQL.R | 27 +++++++++++++++++++++++++-- 2 files changed, 34 insertions(+), 6 deletions(-) diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index e0ce056243585..b7f5f978ebc2c 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -1414,9 +1414,10 @@ setMethod("where", #' @param x A Spark DataFrame #' @param y A Spark DataFrame #' @param joinExpr (Optional) The expression used to perform the join. joinExpr must be a -#' Column expression. If joinExpr is omitted, join() wil perform a Cartesian join +#' Column expression. If joinExpr is omitted, join() will perform a Cartesian join #' @param joinType The type of join to perform. The following join types are available: -#' 'inner', 'outer', 'left_outer', 'right_outer', 'semijoin'. The default joinType is "inner". +#' 'inner', 'outer', 'full', 'fullouter', leftouter', 'left_outer', 'left', +#' 'right_outer', 'rightouter', 'right', and 'leftsemi'. The default joinType is "inner". #' @return A DataFrame containing the result of the join operation. 
#' @rdname join #' @name join @@ -1441,11 +1442,15 @@ setMethod("join", if (is.null(joinType)) { sdf <- callJMethod(x@sdf, "join", y@sdf, joinExpr@jc) } else { - if (joinType %in% c("inner", "outer", "left_outer", "right_outer", "semijoin")) { + if (joinType %in% c("inner", "outer", "full", "fullouter", + "leftouter", "left_outer", "left", + "rightouter", "right_outer", "right", "leftsemi")) { + joinType <- gsub("_", "", joinType) sdf <- callJMethod(x@sdf, "join", y@sdf, joinExpr@jc, joinType) } else { stop("joinType must be one of the following types: ", - "'inner', 'outer', 'left_outer', 'right_outer', 'semijoin'") + "'inner', 'outer', 'full', 'fullouter', 'leftouter', 'left_outer', 'left', + 'rightouter', 'right_outer', 'right', 'leftsemi'") } } } diff --git a/R/pkg/inst/tests/test_sparkSQL.R b/R/pkg/inst/tests/test_sparkSQL.R index d5509e475de05..46cab7646dcf9 100644 --- a/R/pkg/inst/tests/test_sparkSQL.R +++ b/R/pkg/inst/tests/test_sparkSQL.R @@ -1071,7 +1071,7 @@ test_that("join() and merge() on a DataFrame", { expect_equal(names(joined2), c("age", "name", "name", "test")) expect_equal(count(joined2), 3) - joined3 <- join(df, df2, df$name == df2$name, "right_outer") + joined3 <- join(df, df2, df$name == df2$name, "rightouter") expect_equal(names(joined3), c("age", "name", "name", "test")) expect_equal(count(joined3), 4) expect_true(is.na(collect(orderBy(joined3, joined3$age))$age[2])) @@ -1082,11 +1082,34 @@ test_that("join() and merge() on a DataFrame", { expect_equal(count(joined4), 4) expect_equal(collect(orderBy(joined4, joined4$name))$newAge[3], 24) + joined5 <- join(df, df2, df$name == df2$name, "leftouter") + expect_equal(names(joined5), c("age", "name", "name", "test")) + expect_equal(count(joined5), 3) + expect_true(is.na(collect(orderBy(joined5, joined5$age))$age[1])) + + joined6 <- join(df, df2, df$name == df2$name, "inner") + expect_equal(names(joined6), c("age", "name", "name", "test")) + expect_equal(count(joined6), 3) + + joined7 <- join(df, df2, df$name == df2$name, "leftsemi") + expect_equal(names(joined7), c("age", "name")) + expect_equal(count(joined7), 3) + + joined8 <- join(df, df2, df$name == df2$name, "left_outer") + expect_equal(names(joined8), c("age", "name", "name", "test")) + expect_equal(count(joined8), 3) + expect_true(is.na(collect(orderBy(joined8, joined8$age))$age[1])) + + joined9 <- join(df, df2, df$name == df2$name, "right_outer") + expect_equal(names(joined9), c("age", "name", "name", "test")) + expect_equal(count(joined9), 4) + expect_true(is.na(collect(orderBy(joined9, joined9$age))$age[2])) + merged <- select(merge(df, df2, df$name == df2$name, "outer"), alias(df$age + 5, "newAge"), df$name, df2$test) expect_equal(names(merged), c("newAge", "name", "test")) expect_equal(count(merged), 4) - expect_equal(collect(orderBy(merged, joined4$name))$newAge[3], 24) + expect_equal(collect(orderBy(merged, merged$name))$newAge[3], 24) }) test_that("toJSON() returns an RDD of the correct values", { From 390b22fad69a33eb6daee25b6b858a2e768670a5 Mon Sep 17 00:00:00 2001 From: Sun Rui Date: Tue, 13 Oct 2015 22:31:23 -0700 Subject: [PATCH 049/139] [SPARK-10996] [SPARKR] Implement sampleBy() in DataFrameStatFunctions. Author: Sun Rui Closes #9023 from sun-rui/SPARK-10996. 
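The new R wrapper (below) delegates to the JVM-side `DataFrameStatFunctions.sampleBy` via `callJMethod`. A hedged Scala sketch of that underlying API (assuming a SQLContext named `sqlContext`):

    import org.apache.spark.sql.functions.col

    // Stratified sampling on the "key" column: keep roughly 10% of stratum "0" and
    // 20% of stratum "1"; strata that are not listed are treated as fraction zero.
    val df = sqlContext.range(0, 100).select((col("id") % 3).cast("string").as("key"))
    val sampled = df.stat.sampleBy("key", Map("0" -> 0.1, "1" -> 0.2), 0L)
    sampled.groupBy("key").count().show()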
--- R/pkg/NAMESPACE | 3 ++- R/pkg/R/DataFrame.R | 14 ++++++-------- R/pkg/R/generics.R | 6 +++++- R/pkg/R/sparkR.R | 12 +++--------- R/pkg/R/stats.R | 32 ++++++++++++++++++++++++++++++++ R/pkg/R/utils.R | 18 ++++++++++++++++++ R/pkg/inst/tests/test_sparkSQL.R | 10 ++++++++++ 7 files changed, 76 insertions(+), 19 deletions(-) diff --git a/R/pkg/NAMESPACE b/R/pkg/NAMESPACE index ed9cd94e03b13..52f7a0106aae6 100644 --- a/R/pkg/NAMESPACE +++ b/R/pkg/NAMESPACE @@ -65,6 +65,7 @@ exportMethods("arrange", "repartition", "sample", "sample_frac", + "sampleBy", "saveAsParquetFile", "saveAsTable", "saveDF", @@ -254,4 +255,4 @@ export("structField", "structType.structField", "print.structType") -export("as.data.frame") \ No newline at end of file +export("as.data.frame") diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index b7f5f978ebc2c..993be82a47f75 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -1831,17 +1831,15 @@ setMethod("fillna", if (length(colNames) == 0 || !all(colNames != "")) { stop("value should be an a named list with each name being a column name.") } - - # Convert to the named list to an environment to be passed to JVM - valueMap <- new.env() - for (col in colNames) { - # Check each item in the named list is of valid type - v <- value[[col]] + # Check each item in the named list is of valid type + lapply(value, function(v) { if (!(class(v) %in% c("integer", "numeric", "character"))) { stop("Each item in value should be an integer, numeric or charactor.") } - valueMap[[col]] <- v - } + }) + + # Convert to the named list to an environment to be passed to JVM + valueMap <- convertNamedListToEnv(value) # When value is a named list, caller is expected not to pass in cols if (!is.null(cols)) { diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R index c106a0024583e..4a419f785e92c 100644 --- a/R/pkg/R/generics.R +++ b/R/pkg/R/generics.R @@ -509,6 +509,10 @@ setGeneric("sample", setGeneric("sample_frac", function(x, withReplacement, fraction, seed) { standardGeneric("sample_frac") }) +#' @rdname statfunctions +#' @export +setGeneric("sampleBy", function(x, col, fractions, seed) { standardGeneric("sampleBy") }) + #' @rdname saveAsParquetFile #' @export setGeneric("saveAsParquetFile", function(x, path) { standardGeneric("saveAsParquetFile") }) @@ -1006,4 +1010,4 @@ setGeneric("as.data.frame") #' @rdname attach #' @export -setGeneric("attach") \ No newline at end of file +setGeneric("attach") diff --git a/R/pkg/R/sparkR.R b/R/pkg/R/sparkR.R index cc47110f54732..9cf2f1a361cf2 100644 --- a/R/pkg/R/sparkR.R +++ b/R/pkg/R/sparkR.R @@ -163,19 +163,13 @@ sparkR.init <- function( sparkHome <- suppressWarnings(normalizePath(sparkHome)) } - sparkEnvirMap <- new.env() - for (varname in names(sparkEnvir)) { - sparkEnvirMap[[varname]] <- sparkEnvir[[varname]] - } + sparkEnvirMap <- convertNamedListToEnv(sparkEnvir) - sparkExecutorEnvMap <- new.env() - if (!any(names(sparkExecutorEnv) == "LD_LIBRARY_PATH")) { + sparkExecutorEnvMap <- convertNamedListToEnv(sparkExecutorEnv) + if(is.null(sparkExecutorEnvMap$LD_LIBRARY_PATH)) { sparkExecutorEnvMap[["LD_LIBRARY_PATH"]] <- paste0("$LD_LIBRARY_PATH:",Sys.getenv("LD_LIBRARY_PATH")) } - for (varname in names(sparkExecutorEnv)) { - sparkExecutorEnvMap[[varname]] <- sparkExecutorEnv[[varname]] - } nonEmptyJars <- Filter(function(x) { x != "" }, jars) localJarPaths <- lapply(nonEmptyJars, diff --git a/R/pkg/R/stats.R b/R/pkg/R/stats.R index 4928cf4d4367d..f79329b115404 100644 --- a/R/pkg/R/stats.R +++ b/R/pkg/R/stats.R @@ -127,3 +127,35 @@ 
setMethod("freqItems", signature(x = "DataFrame", cols = "character"), sct <- callJMethod(statFunctions, "freqItems", as.list(cols), support) collect(dataFrame(sct)) }) + +#' sampleBy +#' +#' Returns a stratified sample without replacement based on the fraction given on each stratum. +#' +#' @param x A SparkSQL DataFrame +#' @param col column that defines strata +#' @param fractions A named list giving sampling fraction for each stratum. If a stratum is +#' not specified, we treat its fraction as zero. +#' @param seed random seed +#' @return A new DataFrame that represents the stratified sample +#' +#' @rdname statfunctions +#' @name sampleBy +#' @export +#' @examples +#'\dontrun{ +#' df <- jsonFile(sqlContext, "/path/to/file.json") +#' sample <- sampleBy(df, "key", fractions, 36) +#' } +setMethod("sampleBy", + signature(x = "DataFrame", col = "character", + fractions = "list", seed = "numeric"), + function(x, col, fractions, seed) { + fractionsEnv <- convertNamedListToEnv(fractions) + + statFunctions <- callJMethod(x@sdf, "stat") + # Seed is expected to be Long on Scala side, here convert it to an integer + # due to SerDe limitation now. + sdf <- callJMethod(statFunctions, "sampleBy", col, fractionsEnv, as.integer(seed)) + dataFrame(sdf) + }) diff --git a/R/pkg/R/utils.R b/R/pkg/R/utils.R index 94f16c7ac52cc..0b9e2957fe9a5 100644 --- a/R/pkg/R/utils.R +++ b/R/pkg/R/utils.R @@ -605,3 +605,21 @@ structToList <- function(struct) { class(struct) <- "list" struct } + +# Convert a named list to an environment to be passed to JVM +convertNamedListToEnv <- function(namedList) { + # Make sure each item in the list has a name + names <- names(namedList) + stopifnot( + if (is.null(names)) { + length(namedList) == 0 + } else { + !any(is.na(names)) + }) + + env <- new.env() + for (name in names) { + env[[name]] <- namedList[[name]] + } + env +} diff --git a/R/pkg/inst/tests/test_sparkSQL.R b/R/pkg/inst/tests/test_sparkSQL.R index 46cab7646dcf9..e1b42b0804933 100644 --- a/R/pkg/inst/tests/test_sparkSQL.R +++ b/R/pkg/inst/tests/test_sparkSQL.R @@ -1416,6 +1416,16 @@ test_that("freqItems() on a DataFrame", { expect_identical(result[[2]], list(list(-1, -99))) }) +test_that("sampleBy() on a DataFrame", { + l <- lapply(c(0:99), function(i) { as.character(i %% 3) }) + df <- createDataFrame(sqlContext, l, "key") + fractions <- list("0" = 0.1, "1" = 0.2) + sample <- sampleBy(df, "key", fractions, 0) + result <- collect(orderBy(count(groupBy(sample, "key")), "key")) + expect_identical(as.list(result[1, ]), list(key = "0", count = 2)) + expect_identical(as.list(result[2, ]), list(key = "1", count = 10)) +}) + test_that("SQL error message is returned from JVM", { retError <- tryCatch(sql(sqlContext, "select * from blah"), error = function(e) e) expect_equal(grepl("Table Not Found: blah", retError), TRUE) From 135a2ce5b0b927b512c832d61c25e7b9d57e30be Mon Sep 17 00:00:00 2001 From: Tom Graves Date: Wed, 14 Oct 2015 10:12:25 -0700 Subject: [PATCH 050/139] [SPARK-10619] Can't sort columns on Executor Page should pick into spark 1.5.2 also. https://issues.apache.org/jira/browse/SPARK-10619 looks like this was broken by commit: https://github.com/apache/spark/commit/fb1d06fc242ec00320f1a3049673fbb03c4a6eb9#diff-b8adb646ef90f616c34eb5c98d1ebd16 It looks like somethings were change to use the UIUtils.listingTable but executor page wasn't converted so when it removed sortable from the UIUtils. TABLE_CLASS_NOT_STRIPED it broke this page. 
Simply add the sortable tag back in and it fixes both active UI and the history server UI. Author: Tom Graves Closes #9101 from tgravescs/SPARK-10619. --- core/src/main/scala/org/apache/spark/ui/UIUtils.scala | 1 + .../src/main/scala/org/apache/spark/ui/exec/ExecutorsPage.scala | 2 +- .../src/main/scala/org/apache/spark/ui/jobs/ExecutorTable.scala | 2 +- .../main/scala/org/apache/spark/streaming/ui/BatchPage.scala | 2 +- 4 files changed, 4 insertions(+), 3 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/ui/UIUtils.scala b/core/src/main/scala/org/apache/spark/ui/UIUtils.scala index 21dc8f0b65485..68a9f912a5d2c 100644 --- a/core/src/main/scala/org/apache/spark/ui/UIUtils.scala +++ b/core/src/main/scala/org/apache/spark/ui/UIUtils.scala @@ -31,6 +31,7 @@ import org.apache.spark.ui.scope.RDDOperationGraph private[spark] object UIUtils extends Logging { val TABLE_CLASS_NOT_STRIPED = "table table-bordered table-condensed" val TABLE_CLASS_STRIPED = TABLE_CLASS_NOT_STRIPED + " table-striped" + val TABLE_CLASS_STRIPED_SORTABLE = TABLE_CLASS_STRIPED + " sortable" // SimpleDateFormat is not thread-safe. Don't expose it to avoid improper use. private val dateFormat = new ThreadLocal[SimpleDateFormat]() { diff --git a/core/src/main/scala/org/apache/spark/ui/exec/ExecutorsPage.scala b/core/src/main/scala/org/apache/spark/ui/exec/ExecutorsPage.scala index 01cddda4c62cd..1a29b0f412603 100644 --- a/core/src/main/scala/org/apache/spark/ui/exec/ExecutorsPage.scala +++ b/core/src/main/scala/org/apache/spark/ui/exec/ExecutorsPage.scala @@ -62,7 +62,7 @@ private[ui] class ExecutorsPage( val logsExist = execInfo.filter(_.executorLogs.nonEmpty).nonEmpty val execTable = -
      <table class={UIUtils.TABLE_CLASS_STRIPED}>
+      <table class={UIUtils.TABLE_CLASS_STRIPED_SORTABLE}>
diff --git a/core/src/main/scala/org/apache/spark/ui/jobs/ExecutorTable.scala b/core/src/main/scala/org/apache/spark/ui/jobs/ExecutorTable.scala index d5cdbfac104f8..be144f6065baa 100644 --- a/core/src/main/scala/org/apache/spark/ui/jobs/ExecutorTable.scala +++ b/core/src/main/scala/org/apache/spark/ui/jobs/ExecutorTable.scala @@ -50,7 +50,7 @@ private[ui] class ExecutorTable(stageId: Int, stageAttemptId: Int, parent: Stage hasBytesSpilled = data.hasBytesSpilled }) -
    <table class={UIUtils.TABLE_CLASS_STRIPED}>
+    <table class={UIUtils.TABLE_CLASS_STRIPED_SORTABLE}>
       <thead>
         <th>Executor ID</th>
         <th>Address</th>
diff --git a/streaming/src/main/scala/org/apache/spark/streaming/ui/BatchPage.scala b/streaming/src/main/scala/org/apache/spark/streaming/ui/BatchPage.scala index 1b717b64542d5..a19b85a51d289 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/ui/BatchPage.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/ui/BatchPage.scala @@ -443,7 +443,7 @@ private[ui] class BatchPage(parent: StreamingTab) extends WebUIPage("batch") { } def generateInputMetadataTable(inputMetadatas: Seq[(Int, String)]): Seq[Node] = { -
    <table class={UIUtils.TABLE_CLASS_STRIPED}>
+    <table class={UIUtils.TABLE_CLASS_STRIPED_SORTABLE}>
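For context only (not part of the patch above): any UI page that builds its listing with the new UIUtils.TABLE_CLASS_STRIPED_SORTABLE constant is picked up by the UI's sorttable script, which is what restores column sorting here. A minimal sketch; the page method and row markup are hypothetical, only the constant comes from this change:

    import scala.xml.Node
    import org.apache.spark.ui.UIUtils

    // Hypothetical listing fragment: tagging the table with TABLE_CLASS_STRIPED_SORTABLE
    // ("table table-bordered table-condensed table-striped sortable") lets the UI's
    // sorttable JavaScript make every column header clickable for sorting.
    def executorListingTable(rows: Seq[Node]): Seq[Node] = {
      <table class={UIUtils.TABLE_CLASS_STRIPED_SORTABLE}>
        <thead>
          <th>Executor ID</th>
          <th>Address</th>
        </thead>
        <tbody>{rows}</tbody>
      </table>
    }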
From 31f315981709251d5d26c508a3dc62cf0e6f87e1 Mon Sep 17 00:00:00 2001 From: Marcelo Vanzin Date: Wed, 14 Oct 2015 10:25:09 -0700 Subject: [PATCH 051/139] [SPARK-11040] [NETWORK] Make sure SASL handler delegates all events. Author: Marcelo Vanzin Closes #9053 from vanzin/SPARK-11040. --- .../spark/network/sasl/SaslRpcHandler.java | 13 +++++++++++-- .../server/TransportRequestHandler.java | 8 +++++++- .../spark/network/sasl/SparkSaslSuite.java | 19 +++++++++++++++++++ 3 files changed, 37 insertions(+), 3 deletions(-) diff --git a/network/common/src/main/java/org/apache/spark/network/sasl/SaslRpcHandler.java b/network/common/src/main/java/org/apache/spark/network/sasl/SaslRpcHandler.java index 3f2ebe32887b8..7033adb9cae6f 100644 --- a/network/common/src/main/java/org/apache/spark/network/sasl/SaslRpcHandler.java +++ b/network/common/src/main/java/org/apache/spark/network/sasl/SaslRpcHandler.java @@ -115,9 +115,18 @@ public StreamManager getStreamManager() { @Override public void connectionTerminated(TransportClient client) { - if (saslServer != null) { - saslServer.dispose(); + try { + delegate.connectionTerminated(client); + } finally { + if (saslServer != null) { + saslServer.dispose(); + } } } + @Override + public void exceptionCaught(Throwable cause, TransportClient client) { + delegate.exceptionCaught(cause, client); + } + } diff --git a/network/common/src/main/java/org/apache/spark/network/server/TransportRequestHandler.java b/network/common/src/main/java/org/apache/spark/network/server/TransportRequestHandler.java index 96941d26be19d..9b8b047b49a86 100644 --- a/network/common/src/main/java/org/apache/spark/network/server/TransportRequestHandler.java +++ b/network/common/src/main/java/org/apache/spark/network/server/TransportRequestHandler.java @@ -76,7 +76,13 @@ public void exceptionCaught(Throwable cause) { @Override public void channelUnregistered() { - streamManager.connectionTerminated(channel); + if (streamManager != null) { + try { + streamManager.connectionTerminated(channel); + } catch (RuntimeException e) { + logger.error("StreamManager connectionTerminated() callback failed.", e); + } + } rpcHandler.connectionTerminated(reverseClient); } diff --git a/network/common/src/test/java/org/apache/spark/network/sasl/SparkSaslSuite.java b/network/common/src/test/java/org/apache/spark/network/sasl/SparkSaslSuite.java index 8104004847a24..3469e84e7f4da 100644 --- a/network/common/src/test/java/org/apache/spark/network/sasl/SparkSaslSuite.java +++ b/network/common/src/test/java/org/apache/spark/network/sasl/SparkSaslSuite.java @@ -153,6 +153,8 @@ public Void answer(InvocationOnMock invocation) { assertEquals("Pong", new String(response, StandardCharsets.UTF_8)); } finally { ctx.close(); + // There should be 2 terminated events; one for the client, one for the server. + verify(rpcHandler, times(2)).connectionTerminated(any(TransportClient.class)); } } @@ -334,6 +336,23 @@ public void testDataEncryptionIsActuallyEnabled() throws Exception { } } + @Test + public void testRpcHandlerDelegate() throws Exception { + // Tests all delegates exception for receive(), which is more complicated and already handled + // by all other tests. 
+ RpcHandler handler = mock(RpcHandler.class); + RpcHandler saslHandler = new SaslRpcHandler(null, null, handler, null); + + saslHandler.getStreamManager(); + verify(handler).getStreamManager(); + + saslHandler.connectionTerminated(null); + verify(handler).connectionTerminated(any(TransportClient.class)); + + saslHandler.exceptionCaught(null, null); + verify(handler).exceptionCaught(any(Throwable.class), any(TransportClient.class)); + } + private static class SaslTestCtx { final TransportClient client; From 7e1308d37f6ca35f063e67e4b87a77e932ad89a5 Mon Sep 17 00:00:00 2001 From: Huaxin Gao Date: Wed, 14 Oct 2015 12:31:29 -0700 Subject: [PATCH 052/139] [SPARK-8386] [SQL] add write.mode for insertIntoJDBC when the parm overwrite is false the fix is for jira https://issues.apache.org/jira/browse/SPARK-8386 Author: Huaxin Gao Closes #9042 from huaxingao/spark8386. --- sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala index bfe8d3c8ef957..174bc6f42ad8d 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala @@ -1674,7 +1674,7 @@ class DataFrame private[sql]( */ @deprecated("Use write.jdbc()", "1.4.0") def insertIntoJDBC(url: String, table: String, overwrite: Boolean): Unit = { - val w = if (overwrite) write.mode(SaveMode.Overwrite) else write + val w = if (overwrite) write.mode(SaveMode.Overwrite) else write.mode(SaveMode.Append) w.jdbc(url, table, new Properties) } From 615cc858cf913522059b6ebdde65f0204f4fb030 Mon Sep 17 00:00:00 2001 From: Reynold Xin Date: Wed, 14 Oct 2015 12:36:31 -0700 Subject: [PATCH 053/139] [SPARK-10973] Close #9064 Close #9063 Close #9062 These pull requests were merged into branch-1.5, branch-1.4, and branch-1.3. From cf2e0ae7205443f052463e8cb9334ae2b6df2d0e Mon Sep 17 00:00:00 2001 From: Reynold Xin Date: Wed, 14 Oct 2015 12:41:02 -0700 Subject: [PATCH 054/139] [SPARK-11096] Post-hoc review Netty based RPC implementation - round 2 A few more changes: 1. Renamed IDVerifier -> RpcEndpointVerifier 2. Renamed NettyRpcAddress -> RpcEndpointAddress 3. Simplified NettyRpcHandler a bit by removing the connection count tracking. This is OK because I now force spark.shuffle.io.numConnectionsPerPeer to 1 4. Reduced spark.rpc.connect.threads to 64. It would be great to eventually remove this extra thread pool. 5. Minor cleanup & documentation. Author: Reynold Xin Closes #9112 from rxin/SPARK-11096. 
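As a usage illustration only (not part of the diff below): the renamed RpcEndpointVerifier sits behind the ordinary endpoint-reference lookup. A sketch, assuming a started NettyRpcEnv bound as `env` and a remote endpoint registered under the name "worker":

    // Hypothetical lookup: setupEndpointRefByURI first asks the remote env's
    // RpcEndpointVerifier (registered under RpcEndpointVerifier.NAME) whether an
    // endpoint called "worker" exists, and only then returns the reference.
    val workerRef = env.setupEndpointRefByURI("spark://worker@remote-host:7077")
    workerRef.send("ping")  // one-way message routed through the Dispatcher on the remote side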
--- .../scala/org/apache/spark/rpc/RpcEnv.scala | 9 -- .../apache/spark/rpc/netty/Dispatcher.scala | 7 +- .../apache/spark/rpc/netty/NettyRpcEnv.scala | 114 +++++++----------- ...Address.scala => RpcEndpointAddress.scala} | 32 ++--- ...rifier.scala => RpcEndpointVerifier.scala} | 21 ++-- .../rpc/netty/NettyRpcAddressSuite.scala | 2 +- .../rpc/netty/NettyRpcHandlerSuite.scala | 3 - 7 files changed, 81 insertions(+), 107 deletions(-) rename core/src/main/scala/org/apache/spark/rpc/netty/{NettyRpcAddress.scala => RpcEndpointAddress.scala} (65%) rename core/src/main/scala/org/apache/spark/rpc/netty/{IDVerifier.scala => RpcEndpointVerifier.scala} (65%) diff --git a/core/src/main/scala/org/apache/spark/rpc/RpcEnv.scala b/core/src/main/scala/org/apache/spark/rpc/RpcEnv.scala index ef491a0ae4f09..2c4a8b9a0a878 100644 --- a/core/src/main/scala/org/apache/spark/rpc/RpcEnv.scala +++ b/core/src/main/scala/org/apache/spark/rpc/RpcEnv.scala @@ -93,15 +93,6 @@ private[spark] abstract class RpcEnv(conf: SparkConf) { defaultLookupTimeout.awaitResult(asyncSetupEndpointRefByURI(uri)) } - /** - * Retrieve the [[RpcEndpointRef]] represented by `systemName`, `address` and `endpointName` - * asynchronously. - */ - def asyncSetupEndpointRef( - systemName: String, address: RpcAddress, endpointName: String): Future[RpcEndpointRef] = { - asyncSetupEndpointRefByURI(uriOf(systemName, address, endpointName)) - } - /** * Retrieve the [[RpcEndpointRef]] represented by `systemName`, `address` and `endpointName`. * This is a blocking action. diff --git a/core/src/main/scala/org/apache/spark/rpc/netty/Dispatcher.scala b/core/src/main/scala/org/apache/spark/rpc/netty/Dispatcher.scala index 398e9eafc1444..f1a8273f157ef 100644 --- a/core/src/main/scala/org/apache/spark/rpc/netty/Dispatcher.scala +++ b/core/src/main/scala/org/apache/spark/rpc/netty/Dispatcher.scala @@ -29,6 +29,9 @@ import org.apache.spark.network.client.RpcResponseCallback import org.apache.spark.rpc._ import org.apache.spark.util.ThreadUtils +/** + * A message dispatcher, responsible for routing RPC messages to the appropriate endpoint(s). + */ private[netty] class Dispatcher(nettyEnv: NettyRpcEnv) extends Logging { private class EndpointData( @@ -42,7 +45,7 @@ private[netty] class Dispatcher(nettyEnv: NettyRpcEnv) extends Logging { private val endpointRefs = new ConcurrentHashMap[RpcEndpoint, RpcEndpointRef] // Track the receivers whose inboxes may contain messages. - private val receivers = new LinkedBlockingQueue[EndpointData]() + private val receivers = new LinkedBlockingQueue[EndpointData] /** * True if the dispatcher has been stopped. 
Once stopped, all messages posted will be bounced @@ -52,7 +55,7 @@ private[netty] class Dispatcher(nettyEnv: NettyRpcEnv) extends Logging { private var stopped = false def registerRpcEndpoint(name: String, endpoint: RpcEndpoint): NettyRpcEndpointRef = { - val addr = new NettyRpcAddress(nettyEnv.address.host, nettyEnv.address.port, name) + val addr = new RpcEndpointAddress(nettyEnv.address.host, nettyEnv.address.port, name) val endpointRef = new NettyRpcEndpointRef(nettyEnv.conf, addr, nettyEnv) synchronized { if (stopped) { diff --git a/core/src/main/scala/org/apache/spark/rpc/netty/NettyRpcEnv.scala b/core/src/main/scala/org/apache/spark/rpc/netty/NettyRpcEnv.scala index 89b6df76c2707..a2b28c524df9c 100644 --- a/core/src/main/scala/org/apache/spark/rpc/netty/NettyRpcEnv.scala +++ b/core/src/main/scala/org/apache/spark/rpc/netty/NettyRpcEnv.scala @@ -22,7 +22,6 @@ import java.nio.ByteBuffer import java.util.concurrent._ import javax.annotation.concurrent.GuardedBy -import scala.collection.JavaConverters._ import scala.collection.mutable import scala.concurrent.{Future, Promise} import scala.reflect.ClassTag @@ -45,8 +44,10 @@ private[netty] class NettyRpcEnv( host: String, securityManager: SecurityManager) extends RpcEnv(conf) with Logging { - private val transportConf = - SparkTransportConf.fromSparkConf(conf, conf.getInt("spark.rpc.io.threads", 0)) + // Override numConnectionsPerPeer to 1 for RPC. + private val transportConf = SparkTransportConf.fromSparkConf( + conf.clone.set("spark.shuffle.io.numConnectionsPerPeer", "1"), + conf.getInt("spark.rpc.io.threads", 0)) private val dispatcher: Dispatcher = new Dispatcher(this) @@ -54,14 +55,14 @@ private[netty] class NettyRpcEnv( new TransportContext(transportConf, new NettyRpcHandler(dispatcher, this)) private val clientFactory = { - val bootstraps: Seq[TransportClientBootstrap] = + val bootstraps: java.util.List[TransportClientBootstrap] = if (securityManager.isAuthenticationEnabled()) { - Seq(new SaslClientBootstrap(transportConf, "", securityManager, + java.util.Arrays.asList(new SaslClientBootstrap(transportConf, "", securityManager, securityManager.isSaslEncryptionEnabled())) } else { - Nil + java.util.Collections.emptyList[TransportClientBootstrap] } - transportContext.createClientFactory(bootstraps.asJava) + transportContext.createClientFactory(bootstraps) } val timeoutScheduler = ThreadUtils.newDaemonSingleThreadScheduledExecutor("netty-rpc-env-timeout") @@ -71,7 +72,7 @@ private[netty] class NettyRpcEnv( // TODO: a non-blocking TransportClientFactory.createClient in future private val clientConnectionExecutor = ThreadUtils.newDaemonCachedThreadPool( "netty-rpc-connection", - conf.getInt("spark.rpc.connect.threads", 256)) + conf.getInt("spark.rpc.connect.threads", 64)) @volatile private var server: TransportServer = _ @@ -83,7 +84,8 @@ private[netty] class NettyRpcEnv( java.util.Collections.emptyList() } server = transportContext.createServer(port, bootstraps) - dispatcher.registerRpcEndpoint(IDVerifier.NAME, new IDVerifier(this, dispatcher)) + dispatcher.registerRpcEndpoint( + RpcEndpointVerifier.NAME, new RpcEndpointVerifier(this, dispatcher)) } override lazy val address: RpcAddress = { @@ -96,11 +98,11 @@ private[netty] class NettyRpcEnv( } def asyncSetupEndpointRefByURI(uri: String): Future[RpcEndpointRef] = { - val addr = NettyRpcAddress(uri) + val addr = RpcEndpointAddress(uri) val endpointRef = new NettyRpcEndpointRef(conf, addr, this) - val idVerifierRef = - new NettyRpcEndpointRef(conf, NettyRpcAddress(addr.host, addr.port, 
IDVerifier.NAME), this) - idVerifierRef.ask[Boolean](ID(endpointRef.name)).flatMap { find => + val verifier = new NettyRpcEndpointRef( + conf, RpcEndpointAddress(addr.host, addr.port, RpcEndpointVerifier.NAME), this) + verifier.ask[Boolean](RpcEndpointVerifier.CheckExistence(endpointRef.name)).flatMap { find => if (find) { Future.successful(endpointRef) } else { @@ -117,16 +119,18 @@ private[netty] class NettyRpcEnv( private[netty] def send(message: RequestMessage): Unit = { val remoteAddr = message.receiver.address if (remoteAddr == address) { + // Message to a local RPC endpoint. val promise = Promise[Any]() dispatcher.postLocalMessage(message, promise) promise.future.onComplete { case Success(response) => val ack = response.asInstanceOf[Ack] - logDebug(s"Receive ack from ${ack.sender}") + logTrace(s"Received ack from ${ack.sender}") case Failure(e) => logError(s"Exception when sending $message", e) }(ThreadUtils.sameThread) } else { + // Message to a remote RPC endpoint. try { // `createClient` will block if it cannot find a known connection, so we should run it in // clientConnectionExecutor @@ -204,11 +208,10 @@ private[netty] class NettyRpcEnv( } }) } catch { - case e: RejectedExecutionException => { + case e: RejectedExecutionException => if (!promise.tryFailure(e)) { logWarning(s"Ignore failure", e) } - } } } promise.future @@ -231,7 +234,7 @@ private[netty] class NettyRpcEnv( } override def uriOf(systemName: String, address: RpcAddress, endpointName: String): String = - new NettyRpcAddress(address.host, address.port, endpointName).toString + new RpcEndpointAddress(address.host, address.port, endpointName).toString override def shutdown(): Unit = { cleanup() @@ -310,9 +313,9 @@ private[netty] class NettyRpcEndpointRef(@transient conf: SparkConf) @transient @volatile private var nettyEnv: NettyRpcEnv = _ - @transient @volatile private var _address: NettyRpcAddress = _ + @transient @volatile private var _address: RpcEndpointAddress = _ - def this(conf: SparkConf, _address: NettyRpcAddress, nettyEnv: NettyRpcEnv) { + def this(conf: SparkConf, _address: RpcEndpointAddress, nettyEnv: NettyRpcEnv) { this(conf) this._address = _address this.nettyEnv = nettyEnv @@ -322,7 +325,7 @@ private[netty] class NettyRpcEndpointRef(@transient conf: SparkConf) private def readObject(in: ObjectInputStream): Unit = { in.defaultReadObject() - _address = in.readObject().asInstanceOf[NettyRpcAddress] + _address = in.readObject().asInstanceOf[RpcEndpointAddress] nettyEnv = NettyRpcEnv.currentEnv.value } @@ -406,49 +409,37 @@ private[netty] class NettyRpcHandler( private type RemoteEnvAddress = RpcAddress // Store all client addresses and their NettyRpcEnv addresses. + // TODO: Is this even necessary? @GuardedBy("this") private val remoteAddresses = new mutable.HashMap[ClientAddress, RemoteEnvAddress]() - // Store the connections from other NettyRpcEnv addresses. We need to keep track of the connection - // count because `TransportClientFactory.createClient` will create multiple connections - // (at most `spark.shuffle.io.numConnectionsPerPeer` connections) and randomly select a connection - // to send the message. See `TransportClientFactory.createClient` for more details. 
- @GuardedBy("this") - private val remoteConnectionCount = new mutable.HashMap[RemoteEnvAddress, Int]() - override def receive( client: TransportClient, message: Array[Byte], callback: RpcResponseCallback): Unit = { val requestMessage = nettyEnv.deserialize[RequestMessage](message) - val addr = client.getChannel().remoteAddress().asInstanceOf[InetSocketAddress] + val addr = client.getChannel.remoteAddress().asInstanceOf[InetSocketAddress] assert(addr != null) val remoteEnvAddress = requestMessage.senderAddress val clientAddr = RpcAddress(addr.getHostName, addr.getPort) - val broadcastMessage: Option[RemoteProcessConnected] = - synchronized { - // If the first connection to a remote RpcEnv is found, we should broadcast "Associated" - if (remoteAddresses.put(clientAddr, remoteEnvAddress).isEmpty) { - // clientAddr connects at the first time - val count = remoteConnectionCount.getOrElse(remoteEnvAddress, 0) - // Increase the connection number of remoteEnvAddress - remoteConnectionCount.put(remoteEnvAddress, count + 1) - if (count == 0) { - // This is the first connection, so fire "Associated" - Some(RemoteProcessConnected(remoteEnvAddress)) - } else { - None - } - } else { - None - } + + // TODO: Can we add connection callback (channel registered) to the underlying framework? + // A variable to track whether we should dispatch the RemoteProcessConnected message. + var dispatchRemoteProcessConnected = false + synchronized { + if (remoteAddresses.put(clientAddr, remoteEnvAddress).isEmpty) { + // clientAddr connects at the first time, fire "RemoteProcessConnected" + dispatchRemoteProcessConnected = true } - broadcastMessage.foreach(dispatcher.postToAll) + } + if (dispatchRemoteProcessConnected) { + dispatcher.postToAll(RemoteProcessConnected(remoteEnvAddress)) + } dispatcher.postRemoteMessage(requestMessage, callback) } override def getStreamManager: StreamManager = new OneForOneStreamManager override def exceptionCaught(cause: Throwable, client: TransportClient): Unit = { - val addr = client.getChannel().remoteAddress().asInstanceOf[InetSocketAddress] + val addr = client.getChannel.remoteAddress().asInstanceOf[InetSocketAddress] if (addr != null) { val clientAddr = RpcAddress(addr.getHostName, addr.getPort) val broadcastMessage = @@ -469,34 +460,21 @@ private[netty] class NettyRpcHandler( } override def connectionTerminated(client: TransportClient): Unit = { - val addr = client.getChannel().remoteAddress().asInstanceOf[InetSocketAddress] + val addr = client.getChannel.remoteAddress().asInstanceOf[InetSocketAddress] if (addr != null) { val clientAddr = RpcAddress(addr.getHostName, addr.getPort) - val broadcastMessage = - synchronized { - // If the last connection to a remote RpcEnv is terminated, we should broadcast - // "Disassociated" - remoteAddresses.get(clientAddr).flatMap { remoteEnvAddress => - remoteAddresses -= clientAddr - val count = remoteConnectionCount.getOrElse(remoteEnvAddress, 0) - assert(count != 0, "remoteAddresses and remoteConnectionCount are not consistent") - if (count - 1 == 0) { - // We lost all clients, so clean up and fire "Disassociated" - remoteConnectionCount.remove(remoteEnvAddress) - Some(RemoteProcessDisconnected(remoteEnvAddress)) - } else { - // Decrease the connection number of remoteEnvAddress - remoteConnectionCount.put(remoteEnvAddress, count - 1) - None - } - } + val messageOpt: Option[RemoteProcessDisconnected] = + synchronized { + remoteAddresses.get(clientAddr).flatMap { remoteEnvAddress => + remoteAddresses -= clientAddr + 
Some(RemoteProcessDisconnected(remoteEnvAddress)) } - broadcastMessage.foreach(dispatcher.postToAll) + } + messageOpt.foreach(dispatcher.postToAll) } else { // If the channel is closed before connecting, its remoteAddress will be null. In this case, // we can ignore it since we don't fire "Associated". // See java.net.Socket.getRemoteSocketAddress } } - } diff --git a/core/src/main/scala/org/apache/spark/rpc/netty/NettyRpcAddress.scala b/core/src/main/scala/org/apache/spark/rpc/netty/RpcEndpointAddress.scala similarity index 65% rename from core/src/main/scala/org/apache/spark/rpc/netty/NettyRpcAddress.scala rename to core/src/main/scala/org/apache/spark/rpc/netty/RpcEndpointAddress.scala index 1876b25592086..87b6236936817 100644 --- a/core/src/main/scala/org/apache/spark/rpc/netty/NettyRpcAddress.scala +++ b/core/src/main/scala/org/apache/spark/rpc/netty/RpcEndpointAddress.scala @@ -17,40 +17,44 @@ package org.apache.spark.rpc.netty -import java.net.URI - import org.apache.spark.SparkException import org.apache.spark.rpc.RpcAddress -private[netty] case class NettyRpcAddress(host: String, port: Int, name: String) { +/** + * An address identifier for an RPC endpoint. + * + * @param host host name of the remote process. + * @param port the port the remote RPC environment binds to. + * @param name name of the remote endpoint. + */ +private[netty] case class RpcEndpointAddress(host: String, port: Int, name: String) { def toRpcAddress: RpcAddress = RpcAddress(host, port) override val toString = s"spark://$name@$host:$port" } -private[netty] object NettyRpcAddress { +private[netty] object RpcEndpointAddress { - def apply(sparkUrl: String): NettyRpcAddress = { + def apply(sparkUrl: String): RpcEndpointAddress = { try { - val uri = new URI(sparkUrl) + val uri = new java.net.URI(sparkUrl) val host = uri.getHost val port = uri.getPort val name = uri.getUserInfo if (uri.getScheme != "spark" || - host == null || - port < 0 || - name == null || - (uri.getPath != null && !uri.getPath.isEmpty) || // uri.getPath returns "" instead of null - uri.getFragment != null || - uri.getQuery != null) { + host == null || + port < 0 || + name == null || + (uri.getPath != null && !uri.getPath.isEmpty) || // uri.getPath returns "" instead of null + uri.getFragment != null || + uri.getQuery != null) { throw new SparkException("Invalid Spark URL: " + sparkUrl) } - NettyRpcAddress(host, port, name) + RpcEndpointAddress(host, port, name) } catch { case e: java.net.URISyntaxException => throw new SparkException("Invalid Spark URL: " + sparkUrl, e) } } - } diff --git a/core/src/main/scala/org/apache/spark/rpc/netty/IDVerifier.scala b/core/src/main/scala/org/apache/spark/rpc/netty/RpcEndpointVerifier.scala similarity index 65% rename from core/src/main/scala/org/apache/spark/rpc/netty/IDVerifier.scala rename to core/src/main/scala/org/apache/spark/rpc/netty/RpcEndpointVerifier.scala index fa9a3eb99b02a..99f20da2d66aa 100644 --- a/core/src/main/scala/org/apache/spark/rpc/netty/IDVerifier.scala +++ b/core/src/main/scala/org/apache/spark/rpc/netty/RpcEndpointVerifier.scala @@ -14,26 +14,27 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ + package org.apache.spark.rpc.netty import org.apache.spark.rpc.{RpcCallContext, RpcEndpoint, RpcEnv} /** - * A message used to ask the remote [[IDVerifier]] if an [[RpcEndpoint]] exists - */ -private[netty] case class ID(name: String) - -/** - * An [[RpcEndpoint]] for remote [[RpcEnv]]s to query if a [[RpcEndpoint]] exists in this [[RpcEnv]] + * An [[RpcEndpoint]] for remote [[RpcEnv]]s to query if an [[RpcEndpoint]] exists. + * + * This is used when setting up a remote endpoint reference. */ -private[netty] class IDVerifier(override val rpcEnv: RpcEnv, dispatcher: Dispatcher) +private[netty] class RpcEndpointVerifier(override val rpcEnv: RpcEnv, dispatcher: Dispatcher) extends RpcEndpoint { override def receiveAndReply(context: RpcCallContext): PartialFunction[Any, Unit] = { - case ID(name) => context.reply(dispatcher.verify(name)) + case RpcEndpointVerifier.CheckExistence(name) => context.reply(dispatcher.verify(name)) } } -private[netty] object IDVerifier { - val NAME = "id-verifier" +private[netty] object RpcEndpointVerifier { + val NAME = "endpoint-verifier" + + /** A message used to ask the remote [[RpcEndpointVerifier]] if an [[RpcEndpoint]] exists. */ + case class CheckExistence(name: String) } diff --git a/core/src/test/scala/org/apache/spark/rpc/netty/NettyRpcAddressSuite.scala b/core/src/test/scala/org/apache/spark/rpc/netty/NettyRpcAddressSuite.scala index a5d43d3704e37..973a07a0bde3a 100644 --- a/core/src/test/scala/org/apache/spark/rpc/netty/NettyRpcAddressSuite.scala +++ b/core/src/test/scala/org/apache/spark/rpc/netty/NettyRpcAddressSuite.scala @@ -22,7 +22,7 @@ import org.apache.spark.SparkFunSuite class NettyRpcAddressSuite extends SparkFunSuite { test("toString") { - val addr = NettyRpcAddress("localhost", 12345, "test") + val addr = RpcEndpointAddress("localhost", 12345, "test") assert(addr.toString === "spark://test@localhost:12345") } diff --git a/core/src/test/scala/org/apache/spark/rpc/netty/NettyRpcHandlerSuite.scala b/core/src/test/scala/org/apache/spark/rpc/netty/NettyRpcHandlerSuite.scala index f24f78b8c4542..5430e4c0c4d6c 100644 --- a/core/src/test/scala/org/apache/spark/rpc/netty/NettyRpcHandlerSuite.scala +++ b/core/src/test/scala/org/apache/spark/rpc/netty/NettyRpcHandlerSuite.scala @@ -42,9 +42,6 @@ class NettyRpcHandlerSuite extends SparkFunSuite { when(channel.remoteAddress()).thenReturn(new InetSocketAddress("localhost", 40000)) nettyRpcHandler.receive(client, null, null) - when(channel.remoteAddress()).thenReturn(new InetSocketAddress("localhost", 40001)) - nettyRpcHandler.receive(client, null, null) - verify(dispatcher, times(1)).postToAll(RemoteProcessConnected(RpcAddress("localhost", 12345))) } From 9a430a027faafb083ca569698effb697af26a1db Mon Sep 17 00:00:00 2001 From: Wenchen Fan Date: Wed, 14 Oct 2015 15:08:13 -0700 Subject: [PATCH 055/139] [SPARK-11068] [SQL] [FOLLOW-UP] move execution listener to util Author: Wenchen Fan Closes #9119 from cloud-fan/callback. 
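For illustration (not part of the patch): after this move, user code imports the listener types from org.apache.spark.sql.util. A minimal sketch, assuming the listenerManager field on SQLContext that the ExecutionListenerManager import below supports:

    import org.apache.spark.sql.execution.QueryExecution
    import org.apache.spark.sql.util.QueryExecutionListener

    // A trivial listener that logs how long each DataFrame action took.
    val timingListener = new QueryExecutionListener {
      override def onSuccess(funcName: String, qe: QueryExecution, durationNs: Long): Unit = {
        println(s"$funcName succeeded in ${durationNs / 1e6} ms")
      }
      override def onFailure(funcName: String, qe: QueryExecution, exception: Exception): Unit = {
        println(s"$funcName failed: ${exception.getMessage}")
      }
    }

    // Assumed registration point; the DataFrameCallbackSuite below exercises this path.
    sqlContext.listenerManager.register(timingListener)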
--- sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala | 1 + .../apache/spark/sql/{ => util}/QueryExecutionListener.scala | 2 +- .../apache/spark/sql/{ => util}/DataFrameCallbackSuite.scala | 3 ++- 3 files changed, 4 insertions(+), 2 deletions(-) rename sql/core/src/main/scala/org/apache/spark/sql/{ => util}/QueryExecutionListener.scala (99%) rename sql/core/src/test/scala/org/apache/spark/sql/{ => util}/DataFrameCallbackSuite.scala (97%) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala index a835408f8af3a..3d5e35ab315eb 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala @@ -45,6 +45,7 @@ import org.apache.spark.sql.execution.ui.{SQLListener, SQLTab} import org.apache.spark.sql.sources.BaseRelation import org.apache.spark.sql.types._ import org.apache.spark.sql.{execution => sparkexecution} +import org.apache.spark.sql.util.ExecutionListenerManager import org.apache.spark.util.Utils /** diff --git a/sql/core/src/main/scala/org/apache/spark/sql/QueryExecutionListener.scala b/sql/core/src/main/scala/org/apache/spark/sql/util/QueryExecutionListener.scala similarity index 99% rename from sql/core/src/main/scala/org/apache/spark/sql/QueryExecutionListener.scala rename to sql/core/src/main/scala/org/apache/spark/sql/util/QueryExecutionListener.scala index 14fbebb45f8b7..909a8abd225b8 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/QueryExecutionListener.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/util/QueryExecutionListener.scala @@ -15,7 +15,7 @@ * limitations under the License. */ -package org.apache.spark.sql +package org.apache.spark.sql.util import java.util.concurrent.locks.ReentrantReadWriteLock import scala.collection.mutable.ListBuffer diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameCallbackSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/util/DataFrameCallbackSuite.scala similarity index 97% rename from sql/core/src/test/scala/org/apache/spark/sql/DataFrameCallbackSuite.scala rename to sql/core/src/test/scala/org/apache/spark/sql/util/DataFrameCallbackSuite.scala index 4e286a0076205..eb056cd519717 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameCallbackSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/util/DataFrameCallbackSuite.scala @@ -15,9 +15,10 @@ * limitations under the License. */ -package org.apache.spark.sql +package org.apache.spark.sql.util import org.apache.spark.SparkException +import org.apache.spark.sql.{functions, QueryTest} import org.apache.spark.sql.catalyst.plans.logical.{Aggregate, Project} import org.apache.spark.sql.execution.QueryExecution import org.apache.spark.sql.test.SharedSQLContext From 56d7da14ab8f89bf4f303b27f51fd22d23967ffb Mon Sep 17 00:00:00 2001 From: Wenchen Fan Date: Wed, 14 Oct 2015 16:05:37 -0700 Subject: [PATCH 056/139] [SPARK-10104] [SQL] Consolidate different forms of table identifiers Right now, we have QualifiedTableName, TableIdentifier, and Seq[String] to represent table identifiers. We should only have one form and TableIdentifier is the best one because it provides methods to get table name, database name, return unquoted string, and return quoted string. Author: Wenchen Fan Author: Wenchen Fan Closes #8453 from cloud-fan/table-name. 
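To make the consolidation concrete (illustration only, derived from the TableIdentifier definition in the diff below; the class is private[sql], so this is effectively code from inside that package):

    import org.apache.spark.sql.catalyst.TableIdentifier

    val qualified = TableIdentifier("events", Some("logs"))
    qualified.table           // "events"
    qualified.database        // Some("logs")
    qualified.quotedString    // "`logs`.`events`"
    qualified.unquotedString  // "logs.events"

    // With no database, resolution falls back to the current database.
    val bare = TableIdentifier("events")
    bare.quotedString         // "`events`"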
--- .../apache/spark/sql/catalyst/SqlParser.scala | 2 +- .../spark/sql/catalyst/TableIdentifier.scala | 14 +- .../sql/catalyst/analysis/Analyzer.scala | 4 +- .../spark/sql/catalyst/analysis/Catalog.scala | 174 ++++++------------ .../sql/catalyst/analysis/unresolved.scala | 6 +- .../spark/sql/catalyst/dsl/package.scala | 3 +- .../sql/catalyst/analysis/AnalysisSuite.scala | 24 ++- .../sql/catalyst/analysis/AnalysisTest.scala | 10 +- .../analysis/DecimalPrecisionSuite.scala | 4 +- .../apache/spark/sql/DataFrameReader.scala | 3 +- .../apache/spark/sql/DataFrameWriter.scala | 4 +- .../org/apache/spark/sql/SQLContext.scala | 6 +- .../sql/execution/datasources/DDLParser.scala | 2 +- .../spark/sql/execution/datasources/ddl.scala | 7 +- .../sql/execution/datasources/rules.scala | 4 +- .../apache/spark/sql/CachedTableSuite.scala | 7 +- .../org/apache/spark/sql/JoinSuite.scala | 5 +- .../apache/spark/sql/ListTablesSuite.scala | 7 +- .../parquet/ParquetQuerySuite.scala | 6 +- .../apache/spark/sql/hive/HiveContext.scala | 2 +- .../spark/sql/hive/HiveMetastoreCatalog.scala | 134 ++++---------- .../org/apache/spark/sql/hive/HiveQl.scala | 42 ++--- .../hive/execution/CreateTableAsSelect.scala | 12 +- .../hive/execution/CreateViewAsSelect.scala | 9 +- .../spark/sql/hive/execution/commands.scala | 10 +- .../apache/spark/sql/hive/test/TestHive.scala | 2 +- .../hive/JavaMetastoreDataSourcesSuite.java | 6 +- .../spark/sql/hive/ListTablesSuite.scala | 5 +- .../sql/hive/MetastoreDataSourcesSuite.scala | 9 +- .../spark/sql/hive/StatisticsSuite.scala | 5 +- .../sql/hive/execution/SQLQuerySuite.scala | 6 +- .../spark/sql/hive/orc/OrcQuerySuite.scala | 5 +- 32 files changed, 212 insertions(+), 327 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/SqlParser.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/SqlParser.scala index dfab2398857e8..2595e1f90c837 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/SqlParser.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/SqlParser.scala @@ -170,7 +170,7 @@ object SqlParser extends AbstractSparkSQLParser with DataTypeParser { joinedRelation | relationFactor protected lazy val relationFactor: Parser[LogicalPlan] = - ( rep1sep(ident, ".") ~ (opt(AS) ~> opt(ident)) ^^ { + ( tableIdentifier ~ (opt(AS) ~> opt(ident)) ^^ { case tableIdent ~ alias => UnresolvedRelation(tableIdent, alias) } | ("(" ~> start <~ ")") ~ (AS.? ~> ident) ^^ { case s ~ a => Subquery(a, s) } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/TableIdentifier.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/TableIdentifier.scala index d701559bf2d9b..4d4e4ded99477 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/TableIdentifier.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/TableIdentifier.scala @@ -20,14 +20,16 @@ package org.apache.spark.sql.catalyst /** * Identifies a `table` in `database`. If `database` is not defined, the current database is used. 
*/ -private[sql] case class TableIdentifier(table: String, database: Option[String] = None) { - def withDatabase(database: String): TableIdentifier = this.copy(database = Some(database)) - - def toSeq: Seq[String] = database.toSeq :+ table +private[sql] case class TableIdentifier(table: String, database: Option[String]) { + def this(table: String) = this(table, None) override def toString: String = quotedString - def quotedString: String = toSeq.map("`" + _ + "`").mkString(".") + def quotedString: String = database.map(db => s"`$db`.`$table`").getOrElse(s"`$table`") + + def unquotedString: String = database.map(db => s"$db.$table").getOrElse(table) +} - def unquotedString: String = toSeq.mkString(".") +private[sql] object TableIdentifier { + def apply(tableName: String): TableIdentifier = new TableIdentifier(tableName) } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala index 041ab22827399..e6046055bf0f6 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala @@ -105,7 +105,7 @@ class Analyzer( // here use the CTE definition first, check table name only and ignore database name // see https://github.com/apache/spark/pull/4929#discussion_r27186638 for more info case u : UnresolvedRelation => - val substituted = cteRelations.get(u.tableIdentifier.last).map { relation => + val substituted = cteRelations.get(u.tableIdentifier.table).map { relation => val withAlias = u.alias.map(Subquery(_, relation)) withAlias.getOrElse(relation) } @@ -257,7 +257,7 @@ class Analyzer( catalog.lookupRelation(u.tableIdentifier, u.alias) } catch { case _: NoSuchTableException => - u.failAnalysis(s"no such table ${u.tableName}") + u.failAnalysis(s"Table Not Found: ${u.tableName}") } } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Catalog.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Catalog.scala index 4cc9a5520a085..8f4ce74a2ea38 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Catalog.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Catalog.scala @@ -42,11 +42,9 @@ trait Catalog { val conf: CatalystConf - def tableExists(tableIdentifier: Seq[String]): Boolean + def tableExists(tableIdent: TableIdentifier): Boolean - def lookupRelation( - tableIdentifier: Seq[String], - alias: Option[String] = None): LogicalPlan + def lookupRelation(tableIdent: TableIdentifier, alias: Option[String] = None): LogicalPlan /** * Returns tuples of (tableName, isTemporary) for all tables in the given database. 
@@ -56,89 +54,59 @@ trait Catalog { def refreshTable(tableIdent: TableIdentifier): Unit - // TODO: Refactor it in the work of SPARK-10104 - def registerTable(tableIdentifier: Seq[String], plan: LogicalPlan): Unit + def registerTable(tableIdent: TableIdentifier, plan: LogicalPlan): Unit - // TODO: Refactor it in the work of SPARK-10104 - def unregisterTable(tableIdentifier: Seq[String]): Unit + def unregisterTable(tableIdent: TableIdentifier): Unit def unregisterAllTables(): Unit - // TODO: Refactor it in the work of SPARK-10104 - protected def processTableIdentifier(tableIdentifier: Seq[String]): Seq[String] = { - if (conf.caseSensitiveAnalysis) { - tableIdentifier - } else { - tableIdentifier.map(_.toLowerCase) - } - } - - // TODO: Refactor it in the work of SPARK-10104 - protected def getDbTableName(tableIdent: Seq[String]): String = { - val size = tableIdent.size - if (size <= 2) { - tableIdent.mkString(".") - } else { - tableIdent.slice(size - 2, size).mkString(".") - } - } - - // TODO: Refactor it in the work of SPARK-10104 - protected def getDBTable(tableIdent: Seq[String]) : (Option[String], String) = { - (tableIdent.lift(tableIdent.size - 2), tableIdent.last) - } - /** - * It is not allowed to specifiy database name for tables stored in [[SimpleCatalog]]. - * We use this method to check it. + * Get the table name of TableIdentifier for temporary tables. */ - protected def checkTableIdentifier(tableIdentifier: Seq[String]): Unit = { - if (tableIdentifier.length > 1) { + protected def getTableName(tableIdent: TableIdentifier): String = { + // It is not allowed to specify database name for temporary tables. + // We check it here and throw exception if database is defined. + if (tableIdent.database.isDefined) { throw new AnalysisException("Specifying database name or other qualifiers are not allowed " + "for temporary tables. If the table name has dots (.) 
in it, please quote the " + "table name with backticks (`).") } + if (conf.caseSensitiveAnalysis) { + tableIdent.table + } else { + tableIdent.table.toLowerCase + } } } class SimpleCatalog(val conf: CatalystConf) extends Catalog { - val tables = new ConcurrentHashMap[String, LogicalPlan] - - override def registerTable( - tableIdentifier: Seq[String], - plan: LogicalPlan): Unit = { - checkTableIdentifier(tableIdentifier) - val tableIdent = processTableIdentifier(tableIdentifier) - tables.put(getDbTableName(tableIdent), plan) + private[this] val tables = new ConcurrentHashMap[String, LogicalPlan] + + override def registerTable(tableIdent: TableIdentifier, plan: LogicalPlan): Unit = { + tables.put(getTableName(tableIdent), plan) } - override def unregisterTable(tableIdentifier: Seq[String]): Unit = { - checkTableIdentifier(tableIdentifier) - val tableIdent = processTableIdentifier(tableIdentifier) - tables.remove(getDbTableName(tableIdent)) + override def unregisterTable(tableIdent: TableIdentifier): Unit = { + tables.remove(getTableName(tableIdent)) } override def unregisterAllTables(): Unit = { tables.clear() } - override def tableExists(tableIdentifier: Seq[String]): Boolean = { - checkTableIdentifier(tableIdentifier) - val tableIdent = processTableIdentifier(tableIdentifier) - tables.containsKey(getDbTableName(tableIdent)) + override def tableExists(tableIdent: TableIdentifier): Boolean = { + tables.containsKey(getTableName(tableIdent)) } override def lookupRelation( - tableIdentifier: Seq[String], + tableIdent: TableIdentifier, alias: Option[String] = None): LogicalPlan = { - checkTableIdentifier(tableIdentifier) - val tableIdent = processTableIdentifier(tableIdentifier) - val tableFullName = getDbTableName(tableIdent) - val table = tables.get(tableFullName) + val tableName = getTableName(tableIdent) + val table = tables.get(tableName) if (table == null) { - sys.error(s"Table Not Found: $tableFullName") + throw new NoSuchTableException } - val tableWithQualifiers = Subquery(tableIdent.last, table) + val tableWithQualifiers = Subquery(tableName, table) // If an alias was specified by the lookup, wrap the plan in a subquery so that attributes are // properly qualified with this alias. @@ -146,11 +114,7 @@ class SimpleCatalog(val conf: CatalystConf) extends Catalog { } override def getTables(databaseName: Option[String]): Seq[(String, Boolean)] = { - val result = ArrayBuffer.empty[(String, Boolean)] - for (name <- tables.keySet().asScala) { - result += ((name, true)) - } - result + tables.keySet().asScala.map(_ -> true).toSeq } override def refreshTable(tableIdent: TableIdentifier): Unit = { @@ -165,68 +129,50 @@ class SimpleCatalog(val conf: CatalystConf) extends Catalog { * lost when the JVM exits. */ trait OverrideCatalog extends Catalog { + private[this] val overrides = new ConcurrentHashMap[String, LogicalPlan] - // TODO: This doesn't work when the database changes... - val overrides = new mutable.HashMap[(Option[String], String), LogicalPlan]() - - abstract override def tableExists(tableIdentifier: Seq[String]): Boolean = { - val tableIdent = processTableIdentifier(tableIdentifier) - // A temporary tables only has a single part in the tableIdentifier. 
- val overriddenTable = if (tableIdentifier.length > 1) { - None: Option[LogicalPlan] + private def getOverriddenTable(tableIdent: TableIdentifier): Option[LogicalPlan] = { + if (tableIdent.database.isDefined) { + None } else { - overrides.get(getDBTable(tableIdent)) + Option(overrides.get(getTableName(tableIdent))) } - overriddenTable match { + } + + abstract override def tableExists(tableIdent: TableIdentifier): Boolean = { + getOverriddenTable(tableIdent) match { case Some(_) => true - case None => super.tableExists(tableIdentifier) + case None => super.tableExists(tableIdent) } } abstract override def lookupRelation( - tableIdentifier: Seq[String], + tableIdent: TableIdentifier, alias: Option[String] = None): LogicalPlan = { - val tableIdent = processTableIdentifier(tableIdentifier) - // A temporary tables only has a single part in the tableIdentifier. - val overriddenTable = if (tableIdentifier.length > 1) { - None: Option[LogicalPlan] - } else { - overrides.get(getDBTable(tableIdent)) - } - val tableWithQualifers = overriddenTable.map(r => Subquery(tableIdent.last, r)) + getOverriddenTable(tableIdent) match { + case Some(table) => + val tableName = getTableName(tableIdent) + val tableWithQualifiers = Subquery(tableName, table) - // If an alias was specified by the lookup, wrap the plan in a subquery so that attributes are - // properly qualified with this alias. - val withAlias = - tableWithQualifers.map(r => alias.map(a => Subquery(a, r)).getOrElse(r)) + // If an alias was specified by the lookup, wrap the plan in a sub-query so that attributes + // are properly qualified with this alias. + alias.map(a => Subquery(a, tableWithQualifiers)).getOrElse(tableWithQualifiers) - withAlias.getOrElse(super.lookupRelation(tableIdentifier, alias)) + case None => super.lookupRelation(tableIdent, alias) + } } abstract override def getTables(databaseName: Option[String]): Seq[(String, Boolean)] = { - // We always return all temporary tables. - val temporaryTables = overrides.map { - case ((_, tableName), _) => (tableName, true) - }.toSeq - - temporaryTables ++ super.getTables(databaseName) + overrides.keySet().asScala.map(_ -> true).toSeq ++ super.getTables(databaseName) } - override def registerTable( - tableIdentifier: Seq[String], - plan: LogicalPlan): Unit = { - checkTableIdentifier(tableIdentifier) - val tableIdent = processTableIdentifier(tableIdentifier) - overrides.put(getDBTable(tableIdent), plan) + override def registerTable(tableIdent: TableIdentifier, plan: LogicalPlan): Unit = { + overrides.put(getTableName(tableIdent), plan) } - override def unregisterTable(tableIdentifier: Seq[String]): Unit = { - // A temporary tables only has a single part in the tableIdentifier. - // If tableIdentifier has more than one parts, it is not a temporary table - // and we do not need to do anything at here. 
- if (tableIdentifier.length == 1) { - val tableIdent = processTableIdentifier(tableIdentifier) - overrides.remove(getDBTable(tableIdent)) + override def unregisterTable(tableIdent: TableIdentifier): Unit = { + if (tableIdent.database.isEmpty) { + overrides.remove(getTableName(tableIdent)) } } @@ -243,12 +189,12 @@ object EmptyCatalog extends Catalog { override val conf: CatalystConf = EmptyConf - override def tableExists(tableIdentifier: Seq[String]): Boolean = { + override def tableExists(tableIdent: TableIdentifier): Boolean = { throw new UnsupportedOperationException } override def lookupRelation( - tableIdentifier: Seq[String], + tableIdent: TableIdentifier, alias: Option[String] = None): LogicalPlan = { throw new UnsupportedOperationException } @@ -257,15 +203,17 @@ object EmptyCatalog extends Catalog { throw new UnsupportedOperationException } - override def registerTable(tableIdentifier: Seq[String], plan: LogicalPlan): Unit = { + override def registerTable(tableIdent: TableIdentifier, plan: LogicalPlan): Unit = { throw new UnsupportedOperationException } - override def unregisterTable(tableIdentifier: Seq[String]): Unit = { + override def unregisterTable(tableIdent: TableIdentifier): Unit = { throw new UnsupportedOperationException } - override def unregisterAllTables(): Unit = {} + override def unregisterAllTables(): Unit = { + throw new UnsupportedOperationException + } override def refreshTable(tableIdent: TableIdentifier): Unit = { throw new UnsupportedOperationException diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/unresolved.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/unresolved.scala index 43ee3191935eb..c97365003935e 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/unresolved.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/unresolved.scala @@ -19,7 +19,7 @@ package org.apache.spark.sql.catalyst.analysis import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.catalyst.expressions.codegen.CodegenFallback -import org.apache.spark.sql.catalyst.errors +import org.apache.spark.sql.catalyst.{TableIdentifier, errors} import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans.logical.LeafNode import org.apache.spark.sql.catalyst.trees.TreeNode @@ -36,11 +36,11 @@ class UnresolvedException[TreeType <: TreeNode[_]](tree: TreeType, function: Str * Holds the name of a relation that has yet to be looked up in a [[Catalog]]. */ case class UnresolvedRelation( - tableIdentifier: Seq[String], + tableIdentifier: TableIdentifier, alias: Option[String] = None) extends LeafNode { /** Returns a `.` separated name for this relation. 
*/ - def tableName: String = tableIdentifier.mkString(".") + def tableName: String = tableIdentifier.unquotedString override def output: Seq[Attribute] = Nil diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala index 699c4cc63d09a..27b3cd84b3846 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala @@ -286,7 +286,8 @@ package object dsl { def insertInto(tableName: String, overwrite: Boolean = false): LogicalPlan = InsertIntoTable( - analysis.UnresolvedRelation(Seq(tableName)), Map.empty, logicalPlan, overwrite, false) + analysis.UnresolvedRelation(TableIdentifier(tableName)), + Map.empty, logicalPlan, overwrite, false) def analyze: LogicalPlan = EliminateSubQueries(analysis.SimpleAnalyzer.execute(logicalPlan)) } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala index 820b336aac759..ec05cfa63c5bf 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala @@ -17,6 +17,7 @@ package org.apache.spark.sql.catalyst.analysis +import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.catalyst.dsl.expressions._ import org.apache.spark.sql.catalyst.dsl.plans._ import org.apache.spark.sql.catalyst.expressions._ @@ -53,32 +54,39 @@ class AnalysisSuite extends AnalysisTest { Project(testRelation.output, testRelation)) checkAnalysis( - Project(Seq(UnresolvedAttribute("TbL.a")), UnresolvedRelation(Seq("TaBlE"), Some("TbL"))), + Project(Seq(UnresolvedAttribute("TbL.a")), + UnresolvedRelation(TableIdentifier("TaBlE"), Some("TbL"))), Project(testRelation.output, testRelation)) assertAnalysisError( - Project(Seq(UnresolvedAttribute("tBl.a")), UnresolvedRelation(Seq("TaBlE"), Some("TbL"))), + Project(Seq(UnresolvedAttribute("tBl.a")), UnresolvedRelation( + TableIdentifier("TaBlE"), Some("TbL"))), Seq("cannot resolve")) checkAnalysis( - Project(Seq(UnresolvedAttribute("TbL.a")), UnresolvedRelation(Seq("TaBlE"), Some("TbL"))), + Project(Seq(UnresolvedAttribute("TbL.a")), UnresolvedRelation( + TableIdentifier("TaBlE"), Some("TbL"))), Project(testRelation.output, testRelation), caseSensitive = false) checkAnalysis( - Project(Seq(UnresolvedAttribute("tBl.a")), UnresolvedRelation(Seq("TaBlE"), Some("TbL"))), + Project(Seq(UnresolvedAttribute("tBl.a")), UnresolvedRelation( + TableIdentifier("TaBlE"), Some("TbL"))), Project(testRelation.output, testRelation), caseSensitive = false) } test("resolve relations") { - assertAnalysisError(UnresolvedRelation(Seq("tAbLe"), None), Seq("Table Not Found: tAbLe")) + assertAnalysisError( + UnresolvedRelation(TableIdentifier("tAbLe"), None), Seq("Table Not Found: tAbLe")) - checkAnalysis(UnresolvedRelation(Seq("TaBlE"), None), testRelation) + checkAnalysis(UnresolvedRelation(TableIdentifier("TaBlE"), None), testRelation) - checkAnalysis(UnresolvedRelation(Seq("tAbLe"), None), testRelation, caseSensitive = false) + checkAnalysis( + UnresolvedRelation(TableIdentifier("tAbLe"), None), testRelation, caseSensitive = false) - checkAnalysis(UnresolvedRelation(Seq("TaBlE"), None), testRelation, caseSensitive = false) + checkAnalysis( + 
UnresolvedRelation(TableIdentifier("TaBlE"), None), testRelation, caseSensitive = false) } test("divide should be casted into fractional types") { diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisTest.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisTest.scala index 53b3695a86be5..23861ed15da61 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisTest.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisTest.scala @@ -17,9 +17,10 @@ package org.apache.spark.sql.catalyst.analysis +import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.catalyst.plans.PlanTest import org.apache.spark.sql.catalyst.plans.logical._ -import org.apache.spark.sql.catalyst.SimpleCatalystConf +import org.apache.spark.sql.catalyst.{TableIdentifier, SimpleCatalystConf} trait AnalysisTest extends PlanTest { @@ -30,8 +31,8 @@ trait AnalysisTest extends PlanTest { val caseSensitiveCatalog = new SimpleCatalog(caseSensitiveConf) val caseInsensitiveCatalog = new SimpleCatalog(caseInsensitiveConf) - caseSensitiveCatalog.registerTable(Seq("TaBlE"), TestRelations.testRelation) - caseInsensitiveCatalog.registerTable(Seq("TaBlE"), TestRelations.testRelation) + caseSensitiveCatalog.registerTable(TableIdentifier("TaBlE"), TestRelations.testRelation) + caseInsensitiveCatalog.registerTable(TableIdentifier("TaBlE"), TestRelations.testRelation) new Analyzer(caseSensitiveCatalog, EmptyFunctionRegistry, caseSensitiveConf) { override val extendedResolutionRules = EliminateSubQueries :: Nil @@ -67,8 +68,7 @@ trait AnalysisTest extends PlanTest { expectedErrors: Seq[String], caseSensitive: Boolean = true): Unit = { val analyzer = getAnalyzer(caseSensitive) - // todo: make sure we throw AnalysisException during analysis - val e = intercept[Exception] { + val e = intercept[AnalysisException] { analyzer.checkAnalysis(analyzer.execute(inputPlan)) } assert(expectedErrors.map(_.toLowerCase).forall(e.getMessage.toLowerCase.contains), diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/DecimalPrecisionSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/DecimalPrecisionSuite.scala index b4ad618c23e39..40c4ae7920918 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/DecimalPrecisionSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/DecimalPrecisionSuite.scala @@ -23,7 +23,7 @@ import org.apache.spark.SparkFunSuite import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans.logical.{Union, Project, LocalRelation} import org.apache.spark.sql.types._ -import org.apache.spark.sql.catalyst.SimpleCatalystConf +import org.apache.spark.sql.catalyst.{TableIdentifier, SimpleCatalystConf} class DecimalPrecisionSuite extends SparkFunSuite with BeforeAndAfter { val conf = new SimpleCatalystConf(true) @@ -47,7 +47,7 @@ class DecimalPrecisionSuite extends SparkFunSuite with BeforeAndAfter { val b: Expression = UnresolvedAttribute("b") before { - catalog.registerTable(Seq("table"), relation) + catalog.registerTable(TableIdentifier("table"), relation) } private def checkType(expression: Expression, expectedType: DataType): Unit = { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala index 97a8b6518a832..eacdea2c1e5b3 100644 --- 
a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala @@ -33,6 +33,7 @@ import org.apache.spark.sql.execution.datasources.parquet.ParquetRelation import org.apache.spark.sql.execution.datasources.{LogicalRelation, ResolvedDataSource} import org.apache.spark.sql.types.StructType import org.apache.spark.{Logging, Partition} +import org.apache.spark.sql.catalyst.{SqlParser, TableIdentifier} /** * :: Experimental :: @@ -287,7 +288,7 @@ class DataFrameReader private[sql](sqlContext: SQLContext) extends Logging { * @since 1.4.0 */ def table(tableName: String): DataFrame = { - DataFrame(sqlContext, sqlContext.catalog.lookupRelation(Seq(tableName))) + DataFrame(sqlContext, sqlContext.catalog.lookupRelation(TableIdentifier(tableName))) } /////////////////////////////////////////////////////////////////////////////////////// diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala index 03e973666e888..764510ab4b4bd 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala @@ -171,7 +171,7 @@ final class DataFrameWriter private[sql](df: DataFrame) { val overwrite = mode == SaveMode.Overwrite df.sqlContext.executePlan( InsertIntoTable( - UnresolvedRelation(tableIdent.toSeq), + UnresolvedRelation(tableIdent), partitions.getOrElse(Map.empty[String, Option[String]]), df.logicalPlan, overwrite, @@ -201,7 +201,7 @@ final class DataFrameWriter private[sql](df: DataFrame) { } private def saveAsTable(tableIdent: TableIdentifier): Unit = { - val tableExists = df.sqlContext.catalog.tableExists(tableIdent.toSeq) + val tableExists = df.sqlContext.catalog.tableExists(tableIdent) (tableExists, mode) match { case (true, SaveMode.Ignore) => diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala index 3d5e35ab315eb..361eb576c567a 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala @@ -714,7 +714,7 @@ class SQLContext private[sql]( * only during the lifetime of this instance of SQLContext. 
*/ private[sql] def registerDataFrameAsTable(df: DataFrame, tableName: String): Unit = { - catalog.registerTable(Seq(tableName), df.logicalPlan) + catalog.registerTable(TableIdentifier(tableName), df.logicalPlan) } /** @@ -728,7 +728,7 @@ class SQLContext private[sql]( */ def dropTempTable(tableName: String): Unit = { cacheManager.tryUncacheQuery(table(tableName)) - catalog.unregisterTable(Seq(tableName)) + catalog.unregisterTable(TableIdentifier(tableName)) } /** @@ -795,7 +795,7 @@ class SQLContext private[sql]( } private def table(tableIdent: TableIdentifier): DataFrame = { - DataFrame(this, catalog.lookupRelation(tableIdent.toSeq)) + DataFrame(this, catalog.lookupRelation(tableIdent)) } /** diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DDLParser.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DDLParser.scala index f7a88b98c0b48..446739d5b8a2c 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DDLParser.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DDLParser.scala @@ -140,7 +140,7 @@ class DDLParser(parseQuery: String => LogicalPlan) protected lazy val describeTable: Parser[LogicalPlan] = (DESCRIBE ~> opt(EXTENDED)) ~ tableIdentifier ^^ { case e ~ tableIdent => - DescribeCommand(UnresolvedRelation(tableIdent.toSeq, None), e.isDefined) + DescribeCommand(UnresolvedRelation(tableIdent, None), e.isDefined) } protected lazy val refreshTable: Parser[LogicalPlan] = diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/ddl.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/ddl.scala index 31d6b75e13477..e7deeff13dc4d 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/ddl.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/ddl.scala @@ -71,7 +71,6 @@ case class CreateTableUsing( * can analyze the logical plan that will be used to populate the table. * So, [[PreWriteCheck]] can detect cases that are not allowed. */ -// TODO: Use TableIdentifier instead of String for tableName (SPARK-10104). case class CreateTableUsingAsSelect( tableIdent: TableIdentifier, provider: String, @@ -93,7 +92,7 @@ case class CreateTempTableUsing( val resolved = ResolvedDataSource( sqlContext, userSpecifiedSchema, Array.empty[String], provider, options) sqlContext.catalog.registerTable( - tableIdent.toSeq, + tableIdent, DataFrame(sqlContext, LogicalRelation(resolved.relation)).logicalPlan) Seq.empty[Row] @@ -112,7 +111,7 @@ case class CreateTempTableUsingAsSelect( val df = DataFrame(sqlContext, query) val resolved = ResolvedDataSource(sqlContext, provider, partitionColumns, mode, options, df) sqlContext.catalog.registerTable( - tableIdent.toSeq, + tableIdent, DataFrame(sqlContext, LogicalRelation(resolved.relation)).logicalPlan) Seq.empty[Row] @@ -128,7 +127,7 @@ case class RefreshTable(tableIdent: TableIdentifier) // If this table is cached as a InMemoryColumnarRelation, drop the original // cached version and make the new version cached lazily. - val logicalPlan = sqlContext.catalog.lookupRelation(tableIdent.toSeq) + val logicalPlan = sqlContext.catalog.lookupRelation(tableIdent) // Use lookupCachedData directly since RefreshTable also takes databaseName. 
val isCached = sqlContext.cacheManager.lookupCachedData(logicalPlan).nonEmpty if (isCached) { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/rules.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/rules.scala index 8efc8016f94dd..b00e5680fef9e 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/rules.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/rules.scala @@ -143,9 +143,9 @@ private[sql] case class PreWriteCheck(catalog: Catalog) extends (LogicalPlan => case CreateTableUsingAsSelect(tableIdent, _, _, partitionColumns, mode, _, query) => // When the SaveMode is Overwrite, we need to check if the table is an input table of // the query. If so, we will throw an AnalysisException to let users know it is not allowed. - if (mode == SaveMode.Overwrite && catalog.tableExists(tableIdent.toSeq)) { + if (mode == SaveMode.Overwrite && catalog.tableExists(tableIdent)) { // Need to remove SubQuery operator. - EliminateSubQueries(catalog.lookupRelation(tableIdent.toSeq)) match { + EliminateSubQueries(catalog.lookupRelation(tableIdent)) match { // Only do the check if the table is a data source table // (the relation is a BaseRelation). case l @ LogicalRelation(dest: BaseRelation, _) => diff --git a/sql/core/src/test/scala/org/apache/spark/sql/CachedTableSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/CachedTableSuite.scala index 356d4ff3fa837..fd566c8276bc1 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/CachedTableSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/CachedTableSuite.scala @@ -17,6 +17,7 @@ package org.apache.spark.sql +import org.apache.spark.sql.catalyst.analysis.NoSuchTableException import org.apache.spark.sql.execution.PhysicalRDD import scala.concurrent.duration._ @@ -287,8 +288,7 @@ class CachedTableSuite extends QueryTest with SharedSQLContext { testData.select('key).registerTempTable("t1") sqlContext.table("t1") sqlContext.dropTempTable("t1") - assert( - intercept[RuntimeException](sqlContext.table("t1")).getMessage.startsWith("Table Not Found")) + intercept[NoSuchTableException](sqlContext.table("t1")) } test("Drops cached temporary table") { @@ -300,8 +300,7 @@ class CachedTableSuite extends QueryTest with SharedSQLContext { assert(sqlContext.isCached("t2")) sqlContext.dropTempTable("t1") - assert( - intercept[RuntimeException](sqlContext.table("t1")).getMessage.startsWith("Table Not Found")) + intercept[NoSuchTableException](sqlContext.table("t1")) assert(!sqlContext.isCached("t2")) } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/JoinSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/JoinSuite.scala index 7a027e13089e3..b1fb06815868c 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/JoinSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/JoinSuite.scala @@ -18,6 +18,7 @@ package org.apache.spark.sql import org.apache.spark.sql.catalyst.analysis.UnresolvedRelation +import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.execution.joins._ import org.apache.spark.sql.test.SharedSQLContext @@ -359,8 +360,8 @@ class JoinSuite extends QueryTest with SharedSQLContext { upperCaseData.where('N <= 4).registerTempTable("left") upperCaseData.where('N >= 3).registerTempTable("right") - val left = UnresolvedRelation(Seq("left"), None) - val right = UnresolvedRelation(Seq("right"), None) + val left = UnresolvedRelation(TableIdentifier("left"), None) + val right = 
UnresolvedRelation(TableIdentifier("right"), None) checkAnswer( left.join(right, $"left.N" === $"right.N", "full"), diff --git a/sql/core/src/test/scala/org/apache/spark/sql/ListTablesSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/ListTablesSuite.scala index eab0fbb196eb6..5688f46e5e3d4 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/ListTablesSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/ListTablesSuite.scala @@ -21,6 +21,7 @@ import org.scalatest.BeforeAndAfter import org.apache.spark.sql.test.SharedSQLContext import org.apache.spark.sql.types.{BooleanType, StringType, StructField, StructType} +import org.apache.spark.sql.catalyst.TableIdentifier class ListTablesSuite extends QueryTest with BeforeAndAfter with SharedSQLContext { import testImplicits._ @@ -32,7 +33,7 @@ class ListTablesSuite extends QueryTest with BeforeAndAfter with SharedSQLContex } after { - sqlContext.catalog.unregisterTable(Seq("ListTablesSuiteTable")) + sqlContext.catalog.unregisterTable(TableIdentifier("ListTablesSuiteTable")) } test("get all tables") { @@ -44,7 +45,7 @@ class ListTablesSuite extends QueryTest with BeforeAndAfter with SharedSQLContex sql("SHOW tables").filter("tableName = 'ListTablesSuiteTable'"), Row("ListTablesSuiteTable", true)) - sqlContext.catalog.unregisterTable(Seq("ListTablesSuiteTable")) + sqlContext.catalog.unregisterTable(TableIdentifier("ListTablesSuiteTable")) assert(sqlContext.tables().filter("tableName = 'ListTablesSuiteTable'").count() === 0) } @@ -57,7 +58,7 @@ class ListTablesSuite extends QueryTest with BeforeAndAfter with SharedSQLContex sql("show TABLES in DB").filter("tableName = 'ListTablesSuiteTable'"), Row("ListTablesSuiteTable", true)) - sqlContext.catalog.unregisterTable(Seq("ListTablesSuiteTable")) + sqlContext.catalog.unregisterTable(TableIdentifier("ListTablesSuiteTable")) assert(sqlContext.tables().filter("tableName = 'ListTablesSuiteTable'").count() === 0) } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetQuerySuite.scala index cc02ef81c9f8b..baff7f5752a75 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetQuerySuite.scala @@ -22,7 +22,7 @@ import java.io.File import org.apache.hadoop.fs.Path import org.apache.spark.sql._ -import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.{TableIdentifier, InternalRow} import org.apache.spark.sql.catalyst.expressions.SpecificMutableRow import org.apache.spark.sql.execution.datasources.parquet.TestingUDT.{NestedStruct, NestedStructUDT} import org.apache.spark.sql.test.SharedSQLContext @@ -49,7 +49,7 @@ class ParquetQuerySuite extends QueryTest with ParquetTest with SharedSQLContext sql("INSERT INTO TABLE t SELECT * FROM tmp") checkAnswer(sqlContext.table("t"), (data ++ data).map(Row.fromTuple)) } - sqlContext.catalog.unregisterTable(Seq("tmp")) + sqlContext.catalog.unregisterTable(TableIdentifier("tmp")) } test("overwriting") { @@ -59,7 +59,7 @@ class ParquetQuerySuite extends QueryTest with ParquetTest with SharedSQLContext sql("INSERT OVERWRITE TABLE t SELECT * FROM tmp") checkAnswer(sqlContext.table("t"), data.map(Row.fromTuple)) } - sqlContext.catalog.unregisterTable(Seq("tmp")) + sqlContext.catalog.unregisterTable(TableIdentifier("tmp")) } test("self-join") { diff 
--git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala index e620d7fb82af9..4d8a3f728e6b5 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala @@ -358,7 +358,7 @@ class HiveContext private[hive]( @Experimental def analyze(tableName: String) { val tableIdent = SqlParser.parseTableIdentifier(tableName) - val relation = EliminateSubQueries(catalog.lookupRelation(tableIdent.toSeq)) + val relation = EliminateSubQueries(catalog.lookupRelation(tableIdent)) relation match { case relation: MetastoreRelation => diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala index 1f8223e1ff507..5819cb9d08778 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala @@ -36,7 +36,7 @@ import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans.logical import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.catalyst.rules._ -import org.apache.spark.sql.catalyst.{InternalRow, SqlParser, TableIdentifier} +import org.apache.spark.sql.catalyst.{InternalRow, TableIdentifier} import org.apache.spark.sql.execution.datasources.parquet.ParquetRelation import org.apache.spark.sql.execution.datasources.{CreateTableUsingAsSelect, LogicalRelation, Partition => ParquetPartition, PartitionSpec, ResolvedDataSource} import org.apache.spark.sql.execution.{FileRelation, datasources} @@ -103,10 +103,19 @@ private[hive] class HiveMetastoreCatalog(val client: ClientInterface, hive: Hive /** Usages should lock on `this`. */ protected[hive] lazy val hiveWarehouse = new Warehouse(hive.hiveconf) - // TODO: Use this everywhere instead of tuples or databaseName, tableName,. /** A fully qualified identifier for a table (i.e., database.tableName) */ - case class QualifiedTableName(database: String, name: String) { - def toLowerCase: QualifiedTableName = QualifiedTableName(database.toLowerCase, name.toLowerCase) + case class QualifiedTableName(database: String, name: String) + + private def getQualifiedTableName(tableIdent: TableIdentifier) = { + QualifiedTableName( + tableIdent.database.getOrElse(client.currentDatabase).toLowerCase, + tableIdent.table.toLowerCase) + } + + private def getQualifiedTableName(hiveTable: HiveTable) = { + QualifiedTableName( + hiveTable.specifiedDatabase.getOrElse(client.currentDatabase).toLowerCase, + hiveTable.name.toLowerCase) } /** A cache of Spark SQL data source tables that have been accessed. */ @@ -179,33 +188,7 @@ private[hive] class HiveMetastoreCatalog(val client: ClientInterface, hive: Hive } def invalidateTable(tableIdent: TableIdentifier): Unit = { - val databaseName = tableIdent.database.getOrElse(client.currentDatabase) - val tableName = tableIdent.table - - cachedDataSourceTables.invalidate(QualifiedTableName(databaseName, tableName).toLowerCase) - } - - val caseSensitive: Boolean = false - - /** - * Creates a data source table (a table created with USING clause) in Hive's metastore. - * Returns true when the table has been created. Otherwise, false. - */ - // TODO: Remove this in SPARK-10104. 
- def createDataSourceTable( - tableName: String, - userSpecifiedSchema: Option[StructType], - partitionColumns: Array[String], - provider: String, - options: Map[String, String], - isExternal: Boolean): Unit = { - createDataSourceTable( - SqlParser.parseTableIdentifier(tableName), - userSpecifiedSchema, - partitionColumns, - provider, - options, - isExternal) + cachedDataSourceTables.invalidate(getQualifiedTableName(tableIdent)) } def createDataSourceTable( @@ -215,10 +198,7 @@ private[hive] class HiveMetastoreCatalog(val client: ClientInterface, hive: Hive provider: String, options: Map[String, String], isExternal: Boolean): Unit = { - val (dbName, tblName) = { - val database = tableIdent.database.getOrElse(client.currentDatabase) - processDatabaseAndTableName(database, tableIdent.table) - } + val QualifiedTableName(dbName, tblName) = getQualifiedTableName(tableIdent) val tableProperties = new mutable.HashMap[String, String] tableProperties.put("spark.sql.sources.provider", provider) @@ -311,7 +291,7 @@ private[hive] class HiveMetastoreCatalog(val client: ClientInterface, hive: Hive // TODO: Support persisting partitioned data source relations in Hive compatible format val qualifiedTableName = tableIdent.quotedString - val (hiveCompitiableTable, logMessage) = (maybeSerDe, dataSource.relation) match { + val (hiveCompatibleTable, logMessage) = (maybeSerDe, dataSource.relation) match { case (Some(serde), relation: HadoopFsRelation) if relation.paths.length == 1 && relation.partitionColumns.isEmpty => val hiveTable = newHiveCompatibleMetastoreTable(relation, serde) @@ -349,9 +329,9 @@ private[hive] class HiveMetastoreCatalog(val client: ClientInterface, hive: Hive (None, message) } - (hiveCompitiableTable, logMessage) match { + (hiveCompatibleTable, logMessage) match { case (Some(table), message) => - // We first try to save the metadata of the table in a Hive compatiable way. + // We first try to save the metadata of the table in a Hive compatible way. // If Hive throws an error, we fall back to save its metadata in the Spark SQL // specific way. 
try { @@ -374,48 +354,29 @@ private[hive] class HiveMetastoreCatalog(val client: ClientInterface, hive: Hive } } - def hiveDefaultTableFilePath(tableName: String): String = { - hiveDefaultTableFilePath(SqlParser.parseTableIdentifier(tableName)) - } - def hiveDefaultTableFilePath(tableIdent: TableIdentifier): String = { // Code based on: hiveWarehouse.getTablePath(currentDatabase, tableName) - val database = tableIdent.database.getOrElse(client.currentDatabase) - - new Path( - new Path(client.getDatabase(database).location), - tableIdent.table.toLowerCase).toString + val QualifiedTableName(dbName, tblName) = getQualifiedTableName(tableIdent) + new Path(new Path(client.getDatabase(dbName).location), tblName).toString } - def tableExists(tableIdentifier: Seq[String]): Boolean = { - val tableIdent = processTableIdentifier(tableIdentifier) - val databaseName = - tableIdent - .lift(tableIdent.size - 2) - .getOrElse(client.currentDatabase) - val tblName = tableIdent.last - client.getTableOption(databaseName, tblName).isDefined + override def tableExists(tableIdent: TableIdentifier): Boolean = { + val QualifiedTableName(dbName, tblName) = getQualifiedTableName(tableIdent) + client.getTableOption(dbName, tblName).isDefined } - def lookupRelation( - tableIdentifier: Seq[String], + override def lookupRelation( + tableIdent: TableIdentifier, alias: Option[String]): LogicalPlan = { - val tableIdent = processTableIdentifier(tableIdentifier) - val databaseName = tableIdent.lift(tableIdent.size - 2).getOrElse( - client.currentDatabase) - val tblName = tableIdent.last - val table = client.getTable(databaseName, tblName) + val qualifiedTableName = getQualifiedTableName(tableIdent) + val table = client.getTable(qualifiedTableName.database, qualifiedTableName.name) if (table.properties.get("spark.sql.sources.provider").isDefined) { - val dataSourceTable = - cachedDataSourceTables(QualifiedTableName(databaseName, tblName).toLowerCase) + val dataSourceTable = cachedDataSourceTables(qualifiedTableName) + val tableWithQualifiers = Subquery(qualifiedTableName.name, dataSourceTable) // Then, if alias is specified, wrap the table with a Subquery using the alias. // Otherwise, wrap the table with a Subquery using the table name. 
- val withAlias = - alias.map(a => Subquery(a, dataSourceTable)).getOrElse( - Subquery(tableIdent.last, dataSourceTable)) - - withAlias + alias.map(a => Subquery(a, tableWithQualifiers)).getOrElse(tableWithQualifiers) } else if (table.tableType == VirtualView) { val viewText = table.viewText.getOrElse(sys.error("Invalid view without text.")) alias match { @@ -425,7 +386,7 @@ private[hive] class HiveMetastoreCatalog(val client: ClientInterface, hive: Hive case Some(aliasText) => Subquery(aliasText, HiveQl.createPlan(viewText)) } } else { - MetastoreRelation(databaseName, tblName, alias)(table)(hive) + MetastoreRelation(qualifiedTableName.database, qualifiedTableName.name, alias)(table)(hive) } } @@ -524,26 +485,6 @@ private[hive] class HiveMetastoreCatalog(val client: ClientInterface, hive: Hive client.listTables(db).map(tableName => (tableName, false)) } - protected def processDatabaseAndTableName( - databaseName: Option[String], - tableName: String): (Option[String], String) = { - if (!caseSensitive) { - (databaseName.map(_.toLowerCase), tableName.toLowerCase) - } else { - (databaseName, tableName) - } - } - - protected def processDatabaseAndTableName( - databaseName: String, - tableName: String): (String, String) = { - if (!caseSensitive) { - (databaseName.toLowerCase, tableName.toLowerCase) - } else { - (databaseName, tableName) - } - } - /** * When scanning or writing to non-partitioned Metastore Parquet tables, convert them to Parquet * data source relations for better performance. @@ -597,8 +538,7 @@ private[hive] class HiveMetastoreCatalog(val client: ClientInterface, hive: Hive "It is not allowed to define a view with both IF NOT EXISTS and OR REPLACE.") } - val (dbName, tblName) = processDatabaseAndTableName( - table.specifiedDatabase.getOrElse(client.currentDatabase), table.name) + val QualifiedTableName(dbName, tblName) = getQualifiedTableName(table) execution.CreateViewAsSelect( table.copy( @@ -636,7 +576,7 @@ private[hive] class HiveMetastoreCatalog(val client: ClientInterface, hive: Hive val mode = if (allowExisting) SaveMode.Ignore else SaveMode.ErrorIfExists CreateTableUsingAsSelect( TableIdentifier(desc.name), - hive.conf.defaultDataSourceName, + conf.defaultDataSourceName, temporary = false, Array.empty[String], mode, @@ -652,9 +592,7 @@ private[hive] class HiveMetastoreCatalog(val client: ClientInterface, hive: Hive table } - val (dbName, tblName) = - processDatabaseAndTableName( - desc.specifiedDatabase.getOrElse(client.currentDatabase), desc.name) + val QualifiedTableName(dbName, tblName) = getQualifiedTableName(table) execution.CreateTableAsSelect( desc.copy( @@ -712,7 +650,7 @@ private[hive] class HiveMetastoreCatalog(val client: ClientInterface, hive: Hive * UNIMPLEMENTED: It needs to be decided how we will persist in-memory tables to the metastore. * For now, if this functionality is desired mix in the in-memory [[OverrideCatalog]]. */ - override def registerTable(tableIdentifier: Seq[String], plan: LogicalPlan): Unit = { + override def registerTable(tableIdent: TableIdentifier, plan: LogicalPlan): Unit = { throw new UnsupportedOperationException } @@ -720,7 +658,7 @@ private[hive] class HiveMetastoreCatalog(val client: ClientInterface, hive: Hive * UNIMPLEMENTED: It needs to be decided how we will persist in-memory tables to the metastore. * For now, if this functionality is desired mix in the in-memory [[OverrideCatalog]]. 
*/ - override def unregisterTable(tableIdentifier: Seq[String]): Unit = { + override def unregisterTable(tableIdent: TableIdentifier): Unit = { throw new UnsupportedOperationException } diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala index 1d505019400bc..d4ff5cc0f12a2 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala @@ -41,6 +41,7 @@ import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans.{logical, _} import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.catalyst.trees.CurrentOrigin +import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.execution.ExplainCommand import org.apache.spark.sql.execution.datasources.DescribeCommand import org.apache.spark.sql.hive.HiveShim._ @@ -442,24 +443,12 @@ private[hive] object HiveQl extends Logging { throw new NotImplementedError(s"No parse rules for StructField:\n ${dumpTree(a).toString} ") } - protected def extractDbNameTableName(tableNameParts: Node): (Option[String], String) = { - val (db, tableName) = - tableNameParts.getChildren.asScala.map { - case Token(part, Nil) => cleanIdentifier(part) - } match { - case Seq(tableOnly) => (None, tableOnly) - case Seq(databaseName, table) => (Some(databaseName), table) - } - - (db, tableName) - } - - protected def extractTableIdent(tableNameParts: Node): Seq[String] = { + protected def extractTableIdent(tableNameParts: Node): TableIdentifier = { tableNameParts.getChildren.asScala.map { case Token(part, Nil) => cleanIdentifier(part) } match { - case Seq(tableOnly) => Seq(tableOnly) - case Seq(databaseName, table) => Seq(databaseName, table) + case Seq(tableOnly) => TableIdentifier(tableOnly) + case Seq(databaseName, table) => TableIdentifier(table, Some(databaseName)) case other => sys.error("Hive only supports tables names like 'tableName' " + s"or 'databaseName.tableName', found '$other'") } @@ -518,13 +507,13 @@ https://cwiki.apache.org/confluence/display/Hive/Enhanced+Aggregation%2C+Cube%2C properties: Map[String, String], allowExist: Boolean, replace: Boolean): CreateViewAsSelect = { - val (db, viewName) = extractDbNameTableName(viewNameParts) + val TableIdentifier(viewName, dbName) = extractTableIdent(viewNameParts) val originalText = context.getTokenRewriteStream .toString(query.getTokenStartIndex, query.getTokenStopIndex) val tableDesc = HiveTable( - specifiedDatabase = db, + specifiedDatabase = dbName, name = viewName, schema = schema, partitionColumns = Seq.empty[HiveColumn], @@ -611,7 +600,8 @@ https://cwiki.apache.org/confluence/display/Hive/Enhanced+Aggregation%2C+Cube%2C case tableName => // It is describing a table with the format like "describe table". DescribeCommand( - UnresolvedRelation(Seq(tableName.getText), None), isExtended = extended.isDefined) + UnresolvedRelation(TableIdentifier(tableName.getText), None), + isExtended = extended.isDefined) } } // All other cases. 
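For reference, a minimal sketch of the mapping that the reworked extractTableIdent performs (the helper below is hypothetical, a stand-in for the Token-based matching above, and is not part of the patch):

    import org.apache.spark.sql.catalyst.TableIdentifier

    // A one-part name such as "t1" becomes TableIdentifier("t1");
    // a two-part name such as "db1.t1" becomes TableIdentifier("t1", Some("db1")).
    def toTableIdentifier(parts: Seq[String]): TableIdentifier = parts match {
      case Seq(table)           => TableIdentifier(table)
      case Seq(database, table) => TableIdentifier(table, Some(database))
      case other => sys.error(s"Expected 'table' or 'db.table', found '$other'")
    }

Callers that previously handed Seq("db1", "t1") to the catalog now pass the equivalent TableIdentifier, which keeps the database part an explicit Option instead of relying on position in a Seq.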
@@ -716,12 +706,12 @@ https://cwiki.apache.org/confluence/display/Hive/Enhanced+Aggregation%2C+Cube%2C "TOK_TABLELOCATION", "TOK_TABLEPROPERTIES"), children) - val (db, tableName) = extractDbNameTableName(tableNameParts) + val TableIdentifier(tblName, dbName) = extractTableIdent(tableNameParts) // TODO add bucket support var tableDesc: HiveTable = HiveTable( - specifiedDatabase = db, - name = tableName, + specifiedDatabase = dbName, + name = tblName, schema = Seq.empty[HiveColumn], partitionColumns = Seq.empty[HiveColumn], properties = Map[String, String](), @@ -1264,15 +1254,7 @@ https://cwiki.apache.org/confluence/display/Hive/Enhanced+Aggregation%2C+Cube%2C nonAliasClauses) } - val tableIdent = - tableNameParts.getChildren.asScala.map { - case Token(part, Nil) => cleanIdentifier(part) - } match { - case Seq(tableOnly) => Seq(tableOnly) - case Seq(databaseName, table) => Seq(databaseName, table) - case other => sys.error("Hive only supports tables names like 'tableName' " + - s"or 'databaseName.tableName', found '$other'") - } + val tableIdent = extractTableIdent(tableNameParts) val alias = aliasClause.map { case Token(a, Nil) => cleanIdentifier(a) } val relation = UnresolvedRelation(tableIdent, alias) diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/CreateTableAsSelect.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/CreateTableAsSelect.scala index 8422287e177e5..e72a60b42e653 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/CreateTableAsSelect.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/CreateTableAsSelect.scala @@ -17,6 +17,7 @@ package org.apache.spark.sql.hive.execution +import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.catalyst.plans.logical.{InsertIntoTable, LogicalPlan} import org.apache.spark.sql.execution.RunnableCommand import org.apache.spark.sql.hive.client.{HiveColumn, HiveTable} @@ -37,8 +38,7 @@ case class CreateTableAsSelect( allowExisting: Boolean) extends RunnableCommand { - def database: String = tableDesc.database - def tableName: String = tableDesc.name + val tableIdentifier = TableIdentifier(tableDesc.name, Some(tableDesc.database)) override def children: Seq[LogicalPlan] = Seq(query) @@ -72,18 +72,18 @@ case class CreateTableAsSelect( hiveContext.catalog.client.createTable(withSchema) // Get the Metastore Relation - hiveContext.catalog.lookupRelation(Seq(database, tableName), None) match { + hiveContext.catalog.lookupRelation(tableIdentifier, None) match { case r: MetastoreRelation => r } } // TODO ideally, we should get the output data ready first and then // add the relation into catalog, just in case of failure occurs while data // processing. 
- if (hiveContext.catalog.tableExists(Seq(database, tableName))) { + if (hiveContext.catalog.tableExists(tableIdentifier)) { if (allowExisting) { // table already exists, will do nothing, to keep consistent with Hive } else { - throw new AnalysisException(s"$database.$tableName already exists.") + throw new AnalysisException(s"$tableIdentifier already exists.") } } else { hiveContext.executePlan(InsertIntoTable(metastoreRelation, Map(), query, true, false)).toRdd @@ -93,6 +93,6 @@ case class CreateTableAsSelect( } override def argString: String = { - s"[Database:$database, TableName: $tableName, InsertIntoHiveTable]" + s"[Database:${tableDesc.database}, TableName: ${tableDesc.name}, InsertIntoHiveTable]" } } diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/CreateViewAsSelect.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/CreateViewAsSelect.scala index 2b504ac974f07..2c81115ee4fed 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/CreateViewAsSelect.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/CreateViewAsSelect.scala @@ -17,6 +17,7 @@ package org.apache.spark.sql.hive.execution +import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.sql.hive.{HiveMetastoreTypes, HiveContext} import org.apache.spark.sql.{AnalysisException, Row, SQLContext} @@ -38,18 +39,18 @@ private[hive] case class CreateViewAsSelect( assert(tableDesc.schema == Nil || tableDesc.schema.length == childSchema.length) assert(tableDesc.viewText.isDefined) + val tableIdentifier = TableIdentifier(tableDesc.name, Some(tableDesc.database)) + override def run(sqlContext: SQLContext): Seq[Row] = { val hiveContext = sqlContext.asInstanceOf[HiveContext] - val database = tableDesc.database - val viewName = tableDesc.name - if (hiveContext.catalog.tableExists(Seq(database, viewName))) { + if (hiveContext.catalog.tableExists(tableIdentifier)) { if (allowExisting) { // view already exists, will do nothing, to keep consistent with Hive } else if (orReplace) { hiveContext.catalog.client.alertView(prepareTable()) } else { - throw new AnalysisException(s"View $database.$viewName already exists. " + + throw new AnalysisException(s"View $tableIdentifier already exists. " + "If you want to update the view definition, please use ALTER VIEW AS or " + "CREATE OR REPLACE VIEW AS") } diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/commands.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/commands.scala index 51ec92afd06ed..94210a5394f9b 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/commands.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/commands.scala @@ -71,7 +71,7 @@ case class DropTable( } hiveContext.invalidateTable(tableName) hiveContext.runSqlHive(s"DROP TABLE $ifExistsClause$tableName") - hiveContext.catalog.unregisterTable(Seq(tableName)) + hiveContext.catalog.unregisterTable(TableIdentifier(tableName)) Seq.empty[Row] } } @@ -103,7 +103,6 @@ case class AddFile(path: String) extends RunnableCommand { } } -// TODO: Use TableIdentifier instead of String for tableName (SPARK-10104). 
private[hive] case class CreateMetastoreDataSource( tableIdent: TableIdentifier, @@ -131,7 +130,7 @@ case class CreateMetastoreDataSource( val tableName = tableIdent.unquotedString val hiveContext = sqlContext.asInstanceOf[HiveContext] - if (hiveContext.catalog.tableExists(tableIdent.toSeq)) { + if (hiveContext.catalog.tableExists(tableIdent)) { if (allowExisting) { return Seq.empty[Row] } else { @@ -160,7 +159,6 @@ case class CreateMetastoreDataSource( } } -// TODO: Use TableIdentifier instead of String for tableName (SPARK-10104). private[hive] case class CreateMetastoreDataSourceAsSelect( tableIdent: TableIdentifier, @@ -198,7 +196,7 @@ case class CreateMetastoreDataSourceAsSelect( } var existingSchema = None: Option[StructType] - if (sqlContext.catalog.tableExists(tableIdent.toSeq)) { + if (sqlContext.catalog.tableExists(tableIdent)) { // Check if we need to throw an exception or just return. mode match { case SaveMode.ErrorIfExists => @@ -215,7 +213,7 @@ case class CreateMetastoreDataSourceAsSelect( val resolved = ResolvedDataSource( sqlContext, Some(query.schema.asNullable), partitionColumns, provider, optionsWithPath) val createdRelation = LogicalRelation(resolved.relation) - EliminateSubQueries(sqlContext.catalog.lookupRelation(tableIdent.toSeq)) match { + EliminateSubQueries(sqlContext.catalog.lookupRelation(tableIdent)) match { case l @ LogicalRelation(_: InsertableRelation | _: HadoopFsRelation, _) => if (l.relation != createdRelation.relation) { val errorDescription = diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/test/TestHive.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/test/TestHive.scala index ff39ccb7c1ea5..6883d305cbead 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/test/TestHive.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/test/TestHive.scala @@ -191,7 +191,7 @@ class TestHiveContext(sc: SparkContext) extends HiveContext(sc) { // Make sure any test tables referenced are loaded. 
val referencedTables = describedTables ++ - logical.collect { case UnresolvedRelation(tableIdent, _) => tableIdent.last } + logical.collect { case UnresolvedRelation(tableIdent, _) => tableIdent.table } val referencedTestTables = referencedTables.filter(testTables.contains) logDebug(s"Query references test tables: ${referencedTestTables.mkString(", ")}") referencedTestTables.foreach(loadTestTable) diff --git a/sql/hive/src/test/java/org/apache/spark/sql/hive/JavaMetastoreDataSourcesSuite.java b/sql/hive/src/test/java/org/apache/spark/sql/hive/JavaMetastoreDataSourcesSuite.java index c8d272794d10b..8c4af1b8eaf44 100644 --- a/sql/hive/src/test/java/org/apache/spark/sql/hive/JavaMetastoreDataSourcesSuite.java +++ b/sql/hive/src/test/java/org/apache/spark/sql/hive/JavaMetastoreDataSourcesSuite.java @@ -26,7 +26,6 @@ import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; -import org.apache.spark.sql.SaveMode; import org.junit.After; import org.junit.Assert; import org.junit.Before; @@ -41,6 +40,8 @@ import org.apache.spark.sql.types.DataTypes; import org.apache.spark.sql.types.StructField; import org.apache.spark.sql.types.StructType; +import org.apache.spark.sql.SaveMode; +import org.apache.spark.sql.catalyst.TableIdentifier; import org.apache.spark.util.Utils; public class JavaMetastoreDataSourcesSuite { @@ -71,7 +72,8 @@ public void setUp() throws IOException { if (path.exists()) { path.delete(); } - hiveManagedPath = new Path(sqlContext.catalog().hiveDefaultTableFilePath("javaSavedTable")); + hiveManagedPath = new Path(sqlContext.catalog().hiveDefaultTableFilePath( + new TableIdentifier("javaSavedTable"))); fs = hiveManagedPath.getFileSystem(sc.hadoopConfiguration()); if (fs.exists(hiveManagedPath)){ fs.delete(hiveManagedPath, true); diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/ListTablesSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/ListTablesSuite.scala index 579631df772b5..183aca29cf98d 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/ListTablesSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/ListTablesSuite.scala @@ -20,6 +20,7 @@ package org.apache.spark.sql.hive import org.scalatest.BeforeAndAfterAll import org.apache.spark.sql.hive.test.TestHiveSingleton +import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.QueryTest import org.apache.spark.sql.Row @@ -31,14 +32,14 @@ class ListTablesSuite extends QueryTest with TestHiveSingleton with BeforeAndAft override def beforeAll(): Unit = { // The catalog in HiveContext is a case insensitive one. 
- catalog.registerTable(Seq("ListTablesSuiteTable"), df.logicalPlan) + catalog.registerTable(TableIdentifier("ListTablesSuiteTable"), df.logicalPlan) sql("CREATE TABLE HiveListTablesSuiteTable (key int, value string)") sql("CREATE DATABASE IF NOT EXISTS ListTablesSuiteDB") sql("CREATE TABLE ListTablesSuiteDB.HiveInDBListTablesSuiteTable (key int, value string)") } override def afterAll(): Unit = { - catalog.unregisterTable(Seq("ListTablesSuiteTable")) + catalog.unregisterTable(TableIdentifier("ListTablesSuiteTable")) sql("DROP TABLE IF EXISTS HiveListTablesSuiteTable") sql("DROP TABLE IF EXISTS ListTablesSuiteDB.HiveInDBListTablesSuiteTable") sql("DROP DATABASE IF EXISTS ListTablesSuiteDB") diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala index d3565380005a0..d2928876887bd 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/MetastoreDataSourcesSuite.scala @@ -30,6 +30,7 @@ import org.apache.spark.sql.hive.test.TestHiveSingleton import org.apache.spark.sql.execution.datasources.parquet.ParquetRelation import org.apache.spark.sql.test.SQLTestUtils import org.apache.spark.sql.types._ +import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.util.Utils /** @@ -367,7 +368,7 @@ class MetastoreDataSourcesSuite extends QueryTest with SQLTestUtils with TestHiv |) """.stripMargin) - val expectedPath = catalog.hiveDefaultTableFilePath("ctasJsonTable") + val expectedPath = catalog.hiveDefaultTableFilePath(TableIdentifier("ctasJsonTable")) val filesystemPath = new Path(expectedPath) val fs = filesystemPath.getFileSystem(sparkContext.hadoopConfiguration) if (fs.exists(filesystemPath)) fs.delete(filesystemPath, true) @@ -472,7 +473,7 @@ class MetastoreDataSourcesSuite extends QueryTest with SQLTestUtils with TestHiv // Drop table will also delete the data. sql("DROP TABLE savedJsonTable") intercept[IOException] { - read.json(catalog.hiveDefaultTableFilePath("savedJsonTable")) + read.json(catalog.hiveDefaultTableFilePath(TableIdentifier("savedJsonTable"))) } } @@ -703,7 +704,7 @@ class MetastoreDataSourcesSuite extends QueryTest with SQLTestUtils with TestHiv // Manually create a metastore data source table. 
catalog.createDataSourceTable( - tableName = "wide_schema", + tableIdent = TableIdentifier("wide_schema"), userSpecifiedSchema = Some(schema), partitionColumns = Array.empty[String], provider = "json", @@ -733,7 +734,7 @@ class MetastoreDataSourcesSuite extends QueryTest with SQLTestUtils with TestHiv "EXTERNAL" -> "FALSE"), tableType = ManagedTable, serdeProperties = Map( - "path" -> catalog.hiveDefaultTableFilePath(tableName))) + "path" -> catalog.hiveDefaultTableFilePath(TableIdentifier(tableName)))) catalog.client.createTable(hiveTable) diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala index 6a692d6fce562..9bb32f11b76bd 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala @@ -20,6 +20,7 @@ package org.apache.spark.sql.hive import scala.reflect.ClassTag import org.apache.spark.sql.{Row, SQLConf, QueryTest} +import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.execution.joins._ import org.apache.spark.sql.hive.execution._ import org.apache.spark.sql.hive.test.TestHiveSingleton @@ -68,7 +69,7 @@ class StatisticsSuite extends QueryTest with TestHiveSingleton { test("analyze MetastoreRelations") { def queryTotalSize(tableName: String): BigInt = - hiveContext.catalog.lookupRelation(Seq(tableName)).statistics.sizeInBytes + hiveContext.catalog.lookupRelation(TableIdentifier(tableName)).statistics.sizeInBytes // Non-partitioned table sql("CREATE TABLE analyzeTable (key STRING, value STRING)").collect() @@ -115,7 +116,7 @@ class StatisticsSuite extends QueryTest with TestHiveSingleton { intercept[UnsupportedOperationException] { hiveContext.analyze("tempTable") } - hiveContext.catalog.unregisterTable(Seq("tempTable")) + hiveContext.catalog.unregisterTable(TableIdentifier("tempTable")) } test("estimates the size of a test MetastoreRelation") { diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala index 6aa34605b05a8..c929ba50680bc 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala @@ -22,7 +22,7 @@ import java.sql.{Date, Timestamp} import scala.collection.JavaConverters._ import org.apache.spark.sql._ -import org.apache.spark.sql.catalyst.DefaultParserDialect +import org.apache.spark.sql.catalyst.{TableIdentifier, DefaultParserDialect} import org.apache.spark.sql.catalyst.analysis.{FunctionRegistry, EliminateSubQueries} import org.apache.spark.sql.catalyst.errors.DialectException import org.apache.spark.sql.execution.datasources.LogicalRelation @@ -266,7 +266,7 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton { test("CTAS without serde") { def checkRelation(tableName: String, isDataSourceParquet: Boolean): Unit = { - val relation = EliminateSubQueries(catalog.lookupRelation(Seq(tableName))) + val relation = EliminateSubQueries(catalog.lookupRelation(TableIdentifier(tableName))) relation match { case LogicalRelation(r: ParquetRelation, _) => if (!isDataSourceParquet) { @@ -723,7 +723,7 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton { (1 to 100).par.map { i => val tableName = s"SPARK_6618_table_$i" sql(s"CREATE TABLE $tableName (col1 string)") - 
catalog.lookupRelation(Seq(tableName)) + catalog.lookupRelation(TableIdentifier(tableName)) table(tableName) tables() sql(s"DROP TABLE $tableName") diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcQuerySuite.scala index 5eb39b1129701..7efeab528c1dd 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcQuerySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcQuerySuite.scala @@ -24,6 +24,7 @@ import org.apache.hadoop.hive.ql.io.orc.CompressionKind import org.scalatest.BeforeAndAfterAll import org.apache.spark.sql._ +import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.hive.test.TestHive._ import org.apache.spark.sql.hive.test.TestHive.implicits._ @@ -218,7 +219,7 @@ class OrcQuerySuite extends QueryTest with BeforeAndAfterAll with OrcTest { sql("INSERT INTO TABLE t SELECT * FROM tmp") checkAnswer(table("t"), (data ++ data).map(Row.fromTuple)) } - catalog.unregisterTable(Seq("tmp")) + catalog.unregisterTable(TableIdentifier("tmp")) } test("overwriting") { @@ -228,7 +229,7 @@ class OrcQuerySuite extends QueryTest with BeforeAndAfterAll with OrcTest { sql("INSERT OVERWRITE TABLE t SELECT * FROM tmp") checkAnswer(table("t"), data.map(Row.fromTuple)) } - catalog.unregisterTable(Seq("tmp")) + catalog.unregisterTable(TableIdentifier("tmp")) } test("self-join") { From 2b5e31c7e97811ef7b4da47609973b7f51444346 Mon Sep 17 00:00:00 2001 From: Reynold Xin Date: Wed, 14 Oct 2015 16:27:43 -0700 Subject: [PATCH 057/139] [SPARK-11113] [SQL] Remove DeveloperApi annotation from private classes. o.a.s.sql.catalyst and o.a.s.sql.execution are supposed to be private. Author: Reynold Xin Closes #9121 from rxin/SPARK-11113. 
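As an illustration only (the two classes below are made up and not taken from the patch), the mechanical change is to drop the @DeveloperApi annotation and its ":: DeveloperApi ::" Scaladoc tag from operators whose enclosing package is already treated as private to Spark SQL:

    import org.apache.spark.annotation.DeveloperApi

    // Before this patch, operators in org.apache.spark.sql.execution were tagged like this,
    // even though the package itself is not meant to be a public API surface.
    @DeveloperApi
    class AnnotatedOperatorBefore

    // After the patch the annotation and Scaladoc tag are simply removed; nothing else about
    // the class changes, since visibility is governed by the package being private.
    class UnannotatedOperatorAfter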
--- .../expressions/codegen/package.scala | 3 -- .../spark/sql/execution/Aggregate.scala | 3 -- .../apache/spark/sql/execution/Exchange.scala | 5 +--- .../spark/sql/execution/ExistingRDD.scala | 6 +--- .../apache/spark/sql/execution/Expand.scala | 2 -- .../apache/spark/sql/execution/Generate.scala | 3 -- .../spark/sql/execution/LocalTableScan.scala | 3 +- .../spark/sql/execution/QueryExecution.scala | 7 ++--- .../spark/sql/execution/ShuffledRowRDD.scala | 1 - .../spark/sql/execution/SparkPlan.scala | 6 ++-- .../apache/spark/sql/execution/Window.scala | 10 ++----- .../spark/sql/execution/basicOperators.scala | 28 ++---------------- .../apache/spark/sql/execution/commands.scala | 29 +++---------------- .../execution/joins/BroadcastHashJoin.scala | 3 -- .../joins/BroadcastHashOuterJoin.scala | 3 -- .../joins/BroadcastLeftSemiJoinHash.scala | 3 -- .../joins/BroadcastNestedLoopJoin.scala | 6 +--- .../execution/joins/CartesianProduct.scala | 6 +--- .../sql/execution/joins/HashOuterJoin.scala | 9 ++---- .../sql/execution/joins/LeftSemiJoinBNL.scala | 3 -- .../execution/joins/LeftSemiJoinHash.scala | 3 -- .../execution/joins/ShuffledHashJoin.scala | 3 -- .../joins/ShuffledHashOuterJoin.scala | 3 -- .../sql/execution/joins/SortMergeJoin.scala | 3 -- .../execution/joins/SortMergeOuterJoin.scala | 3 -- .../spark/sql/execution/joins/package.scala | 6 ---- .../apache/spark/sql/execution/python.scala | 7 +---- .../sql/execution/rowFormatConverters.scala | 5 ---- .../spark/sql/test/ExamplePointUDT.scala | 3 -- 29 files changed, 22 insertions(+), 153 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/package.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/package.scala index 606fecbe06e47..41128fe389d46 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/package.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/package.scala @@ -17,7 +17,6 @@ package org.apache.spark.sql.catalyst.expressions -import org.apache.spark.annotation.DeveloperApi import org.apache.spark.sql.catalyst.rules import org.apache.spark.util.Utils @@ -40,10 +39,8 @@ package object codegen { } /** - * :: DeveloperApi :: * Dumps the bytecode from a class to the screen using javap. */ - @DeveloperApi object DumpByteCode { import scala.sys.process._ val dumpDirectory = Utils.createTempDir() diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/Aggregate.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/Aggregate.scala index f3b6a3a5f4a33..6f3f1bd97ad52 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/Aggregate.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/Aggregate.scala @@ -19,7 +19,6 @@ package org.apache.spark.sql.execution import java.util.HashMap -import org.apache.spark.annotation.DeveloperApi import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.errors._ @@ -28,7 +27,6 @@ import org.apache.spark.sql.catalyst.plans.physical._ import org.apache.spark.sql.execution.metric.SQLMetrics /** - * :: DeveloperApi :: * Groups input data by `groupingExpressions` and computes the `aggregateExpressions` for each * group. * @@ -38,7 +36,6 @@ import org.apache.spark.sql.execution.metric.SQLMetrics * @param aggregateExpressions expressions that are computed for each group. * @param child the input data source. 
*/ -@DeveloperApi case class Aggregate( partial: Boolean, groupingExpressions: Seq[Expression], diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/Exchange.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/Exchange.scala index 8efa471600b1b..289453753f18d 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/Exchange.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/Exchange.scala @@ -19,7 +19,7 @@ package org.apache.spark.sql.execution import java.util.Random -import org.apache.spark.annotation.DeveloperApi +import org.apache.spark._ import org.apache.spark.rdd.RDD import org.apache.spark.serializer.Serializer import org.apache.spark.shuffle.hash.HashShuffleManager @@ -33,13 +33,10 @@ import org.apache.spark.sql.catalyst.expressions.codegen.GenerateUnsafeProjectio import org.apache.spark.sql.catalyst.plans.physical._ import org.apache.spark.sql.catalyst.rules.Rule import org.apache.spark.util.MutablePair -import org.apache.spark._ /** - * :: DeveloperApi :: * Performs a shuffle that will result in the desired `newPartitioning`. */ -@DeveloperApi case class Exchange(newPartitioning: Partitioning, child: SparkPlan) extends UnaryNode { override def nodeName: String = if (tungstenMode) "TungstenExchange" else "Exchange" diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/ExistingRDD.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/ExistingRDD.scala index abb60cf12e3a5..87bd92e00a2c1 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/ExistingRDD.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/ExistingRDD.scala @@ -17,7 +17,6 @@ package org.apache.spark.sql.execution -import org.apache.spark.annotation.DeveloperApi import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.{InternalRow, CatalystTypeConverters} import org.apache.spark.sql.catalyst.analysis.MultiInstanceRelation @@ -27,10 +26,7 @@ import org.apache.spark.sql.sources.BaseRelation import org.apache.spark.sql.types.DataType import org.apache.spark.sql.{Row, SQLContext} -/** - * :: DeveloperApi :: - */ -@DeveloperApi + object RDDConversions { def productToRowRdd[A <: Product](data: RDD[A], outputTypes: Seq[DataType]): RDD[InternalRow] = { data.mapPartitions { iterator => diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/Expand.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/Expand.scala index d90cae1c4c060..a458881f40948 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/Expand.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/Expand.scala @@ -17,7 +17,6 @@ package org.apache.spark.sql.execution -import org.apache.spark.annotation.DeveloperApi import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.errors._ @@ -32,7 +31,6 @@ import org.apache.spark.sql.catalyst.plans.physical.{Partitioning, UnknownPartit * @param output The output Schema * @param child Child operator */ -@DeveloperApi case class Expand( projections: Seq[Seq[Expression]], output: Seq[Attribute], diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/Generate.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/Generate.scala index c3c0dc441c928..78e33d9f233a6 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/Generate.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/Generate.scala @@ -17,7 +17,6 @@ package org.apache.spark.sql.execution -import 
org.apache.spark.annotation.DeveloperApi import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions._ @@ -35,7 +34,6 @@ private[execution] sealed case class LazyIterator(func: () => TraversableOnce[In } /** - * :: DeveloperApi :: * Applies a [[Generator]] to a stream of input rows, combining the * output of each into a new stream of rows. This operation is similar to a `flatMap` in functional * programming with one important additional feature, which allows the input rows to be joined with @@ -48,7 +46,6 @@ private[execution] sealed case class LazyIterator(func: () => TraversableOnce[In * @param output the output attributes of this node, which constructed in analysis phase, * and we can not change it, as the parent node bound with it already. */ -@DeveloperApi case class Generate( generator: Generator, join: Boolean, diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/LocalTableScan.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/LocalTableScan.scala index adb6bbc4acc5b..ba7f6287ac6c3 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/LocalTableScan.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/LocalTableScan.scala @@ -18,8 +18,7 @@ package org.apache.spark.sql.execution import org.apache.spark.rdd.RDD -import org.apache.spark.sql.Row -import org.apache.spark.sql.catalyst.{InternalRow, CatalystTypeConverters} +import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.Attribute diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/QueryExecution.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/QueryExecution.scala index 7bb4133a29059..fc9174549e642 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/QueryExecution.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/QueryExecution.scala @@ -17,18 +17,15 @@ package org.apache.spark.sql.execution -import org.apache.spark.annotation.{Experimental, DeveloperApi} import org.apache.spark.rdd.RDD -import org.apache.spark.sql.catalyst.{InternalRow, optimizer} -import org.apache.spark.sql.{SQLContext, Row} +import org.apache.spark.sql.SQLContext +import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan /** - * :: DeveloperApi :: * The primary workflow for executing relational queries using Spark. Designed to allow easy * access to the intermediate phases of query execution for developers. 
*/ -@DeveloperApi class QueryExecution(val sqlContext: SQLContext, val logical: LogicalPlan) { val analyzer = sqlContext.analyzer val optimizer = sqlContext.optimizer diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/ShuffledRowRDD.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/ShuffledRowRDD.scala index 743c99a899c61..fb338b90bf79b 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/ShuffledRowRDD.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/ShuffledRowRDD.scala @@ -21,7 +21,6 @@ import org.apache.spark._ import org.apache.spark.rdd.RDD import org.apache.spark.serializer.Serializer import org.apache.spark.sql.catalyst.InternalRow -import org.apache.spark.sql.types.DataType private class ShuffledRowRDDPartition(val idx: Int) extends Partition { override val index: Int = idx diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlan.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlan.scala index fcb42047ffe60..8bb293ae87e64 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlan.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlan.scala @@ -22,7 +22,6 @@ import java.util.concurrent.atomic.AtomicBoolean import scala.collection.mutable.ArrayBuffer import org.apache.spark.Logging -import org.apache.spark.annotation.DeveloperApi import org.apache.spark.rdd.{RDD, RDDOperationScope} import org.apache.spark.sql.SQLContext import org.apache.spark.sql.catalyst.InternalRow @@ -32,7 +31,7 @@ import org.apache.spark.sql.Row import org.apache.spark.sql.catalyst.expressions.codegen._ import org.apache.spark.sql.catalyst.plans.QueryPlan import org.apache.spark.sql.catalyst.plans.physical._ -import org.apache.spark.sql.execution.metric.{LongSQLMetric, SQLMetric, SQLMetrics} +import org.apache.spark.sql.execution.metric.{LongSQLMetric, SQLMetric} import org.apache.spark.sql.types.DataType object SparkPlan { @@ -40,9 +39,8 @@ object SparkPlan { } /** - * :: DeveloperApi :: + * The base class for physical operators. */ -@DeveloperApi abstract class SparkPlan extends QueryPlan[SparkPlan] with Logging with Serializable { /** diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/Window.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/Window.scala index 55035f4bc5f2a..53c5ccf8fa37e 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/Window.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/Window.scala @@ -17,19 +17,14 @@ package org.apache.spark.sql.execution -import java.util - -import org.apache.spark.annotation.DeveloperApi import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans.physical._ import org.apache.spark.sql.types.IntegerType import org.apache.spark.rdd.RDD import org.apache.spark.util.collection.CompactBuffer -import scala.collection.mutable /** - * :: DeveloperApi :: * This class calculates and outputs (windowed) aggregates over the rows in a single (sorted) * partition. The aggregates are calculated for each row in the group. Special processing * instructions, frames, are used to calculate these aggregates. Frames are processed in the order @@ -76,7 +71,6 @@ import scala.collection.mutable * Entire Partition, Sliding, Growing & Shrinking. Boundary evaluation is also delegated to a pair * of specialized classes: [[RowBoundOrdering]] & [[RangeBoundOrdering]]. 
*/ -@DeveloperApi case class Window( projectList: Seq[Attribute], windowExpression: Seq[NamedExpression], @@ -229,7 +223,7 @@ case class Window( // function result buffer. val framedWindowExprs = windowExprs.groupBy(_.windowSpec.frameSpecification) val factories = Array.ofDim[() => WindowFunctionFrame](framedWindowExprs.size) - val unboundExpressions = mutable.Buffer.empty[Expression] + val unboundExpressions = scala.collection.mutable.Buffer.empty[Expression] framedWindowExprs.zipWithIndex.foreach { case ((frame, unboundFrameExpressions), index) => // Track the ordinal. @@ -529,7 +523,7 @@ private[execution] final class SlidingWindowFunctionFrame( private[this] var inputLowIndex = 0 /** Buffer used for storing prepared input for the window functions. */ - private[this] val buffer = new util.ArrayDeque[Array[AnyRef]] + private[this] val buffer = new java.util.ArrayDeque[Array[AnyRef]] /** Index of the row we are currently writing. */ private[this] var outputIndex = 0 diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/basicOperators.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/basicOperators.scala index 7804b67ac2367..4db9f4ee67bb0 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/basicOperators.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/basicOperators.scala @@ -17,7 +17,6 @@ package org.apache.spark.sql.execution -import org.apache.spark.annotation.DeveloperApi import org.apache.spark.rdd.{PartitionwiseSampledRDD, RDD, ShuffledRDD} import org.apache.spark.shuffle.sort.SortShuffleManager import org.apache.spark.sql.catalyst.InternalRow @@ -28,10 +27,7 @@ import org.apache.spark.util.MutablePair import org.apache.spark.util.random.PoissonSampler import org.apache.spark.{HashPartitioner, SparkEnv} -/** - * :: DeveloperApi :: - */ -@DeveloperApi + case class Project(projectList: Seq[NamedExpression], child: SparkPlan) extends UnaryNode { override def output: Seq[Attribute] = projectList.map(_.toAttribute) @@ -90,10 +86,6 @@ case class TungstenProject(projectList: Seq[NamedExpression], child: SparkPlan) } -/** - * :: DeveloperApi :: - */ -@DeveloperApi case class Filter(condition: Expression, child: SparkPlan) extends UnaryNode { override def output: Seq[Attribute] = child.output @@ -125,8 +117,8 @@ case class Filter(condition: Expression, child: SparkPlan) extends UnaryNode { } /** - * :: DeveloperApi :: * Sample the dataset. + * * @param lowerBound Lower-bound of the sampling probability (usually 0.0) * @param upperBound Upper-bound of the sampling probability. The expected fraction sampled * will be ub - lb. @@ -134,7 +126,6 @@ case class Filter(condition: Expression, child: SparkPlan) extends UnaryNode { * @param seed the random seed * @param child the SparkPlan */ -@DeveloperApi case class Sample( lowerBound: Double, upperBound: Double, @@ -165,9 +156,8 @@ case class Sample( } /** - * :: DeveloperApi :: + * Union two plans, without a distinct. This is UNION ALL in SQL. */ -@DeveloperApi case class Union(children: Seq[SparkPlan]) extends SparkPlan { // TODO: attributes output by union should be distinct for nullability purposes override def output: Seq[Attribute] = children.head.output @@ -179,14 +169,12 @@ case class Union(children: Seq[SparkPlan]) extends SparkPlan { } /** - * :: DeveloperApi :: * Take the first limit elements. Note that the implementation is different depending on whether * this is a terminal operator or not. 
If it is terminal and is invoked using executeCollect, * this operator uses something similar to Spark's take method on the Spark driver. If it is not * terminal or is invoked using execute, we first take the limit on each partition, and then * repartition all the data to a single partition to compute the global limit. */ -@DeveloperApi case class Limit(limit: Int, child: SparkPlan) extends UnaryNode { // TODO: Implement a partition local limit, and use a strategy to generate the proper limit plan: @@ -219,14 +207,12 @@ case class Limit(limit: Int, child: SparkPlan) } /** - * :: DeveloperApi :: * Take the first limit elements as defined by the sortOrder, and do projection if needed. * This is logically equivalent to having a [[Limit]] operator after a [[Sort]] operator, * or having a [[Project]] operator between them. * This could have been named TopK, but Spark's top operator does the opposite in ordering * so we name it TakeOrdered to avoid confusion. */ -@DeveloperApi case class TakeOrderedAndProject( limit: Int, sortOrder: Seq[SortOrder], @@ -271,13 +257,11 @@ case class TakeOrderedAndProject( } /** - * :: DeveloperApi :: * Return a new RDD that has exactly `numPartitions` partitions. * Similar to coalesce defined on an [[RDD]], this operation results in a narrow dependency, e.g. * if you go from 1000 partitions to 100 partitions, there will not be a shuffle, instead each of * the 100 new partitions will claim 10 of the current partitions. */ -@DeveloperApi case class Coalesce(numPartitions: Int, child: SparkPlan) extends UnaryNode { override def output: Seq[Attribute] = child.output @@ -294,11 +278,9 @@ case class Coalesce(numPartitions: Int, child: SparkPlan) extends UnaryNode { } /** - * :: DeveloperApi :: * Returns a table with the elements from left that are not in right using * the built-in spark subtract function. */ -@DeveloperApi case class Except(left: SparkPlan, right: SparkPlan) extends BinaryNode { override def output: Seq[Attribute] = left.output @@ -308,11 +290,9 @@ case class Except(left: SparkPlan, right: SparkPlan) extends BinaryNode { } /** - * :: DeveloperApi :: * Returns the rows in left that also appear in right using the built in spark * intersection function. */ -@DeveloperApi case class Intersect(left: SparkPlan, right: SparkPlan) extends BinaryNode { override def output: Seq[Attribute] = children.head.output @@ -322,12 +302,10 @@ case class Intersect(left: SparkPlan, right: SparkPlan) extends BinaryNode { } /** - * :: DeveloperApi :: * A plan node that does nothing but lie about the output of its child. Used to spice a * (hopefully structurally equivalent) tree from a different optimization sequence into an already * resolved tree. 
*/ -@DeveloperApi case class OutputFaker(output: Seq[Attribute], child: SparkPlan) extends SparkPlan { def children: Seq[SparkPlan] = child :: Nil diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/commands.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/commands.scala index 05ccc53830bd1..856607615ae87 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/commands.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/commands.scala @@ -20,11 +20,10 @@ package org.apache.spark.sql.execution import java.util.NoSuchElementException import org.apache.spark.Logging -import org.apache.spark.annotation.DeveloperApi import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.{InternalRow, CatalystTypeConverters} import org.apache.spark.sql.catalyst.errors.TreeNodeException -import org.apache.spark.sql.catalyst.expressions.{ExpressionDescription, Expression, Attribute, AttributeReference} +import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference} import org.apache.spark.sql.catalyst.plans.logical import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.types._ @@ -74,10 +73,7 @@ private[sql] case class ExecutedCommand(cmd: RunnableCommand) extends SparkPlan override def argString: String = cmd.toString } -/** - * :: DeveloperApi :: - */ -@DeveloperApi + case class SetCommand(kv: Option[(String, Option[String])]) extends RunnableCommand with Logging { private def keyValueOutput: Seq[Attribute] = { @@ -180,10 +176,7 @@ case class SetCommand(kv: Option[(String, Option[String])]) extends RunnableComm * * Note that this command takes in a logical plan, runs the optimizer on the logical plan * (but do NOT actually execute it). - * - * :: DeveloperApi :: */ -@DeveloperApi case class ExplainCommand( logicalPlan: LogicalPlan, override val output: Seq[Attribute] = @@ -203,10 +196,7 @@ case class ExplainCommand( } } -/** - * :: DeveloperApi :: - */ -@DeveloperApi + case class CacheTableCommand( tableName: String, plan: Option[LogicalPlan], @@ -231,10 +221,6 @@ case class CacheTableCommand( } -/** - * :: DeveloperApi :: - */ -@DeveloperApi case class UncacheTableCommand(tableName: String) extends RunnableCommand { override def run(sqlContext: SQLContext): Seq[Row] = { @@ -246,10 +232,8 @@ case class UncacheTableCommand(tableName: String) extends RunnableCommand { } /** - * :: DeveloperApi :: * Clear all cached data from the in-memory cache. */ -@DeveloperApi case object ClearCacheCommand extends RunnableCommand { override def run(sqlContext: SQLContext): Seq[Row] = { @@ -260,10 +244,7 @@ case object ClearCacheCommand extends RunnableCommand { override def output: Seq[Attribute] = Seq.empty } -/** - * :: DeveloperApi :: - */ -@DeveloperApi + case class DescribeCommand( child: SparkPlan, override val output: Seq[Attribute], @@ -286,9 +267,7 @@ case class DescribeCommand( * {{{ * SHOW TABLES [IN databaseName] * }}} - * :: DeveloperApi :: */ -@DeveloperApi case class ShowTablesCommand(databaseName: Option[String]) extends RunnableCommand { // The result of SHOW TABLES has two columns, tableName and isTemporary. 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/BroadcastHashJoin.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/BroadcastHashJoin.scala index 2e108cb814516..1d381e2eaef38 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/BroadcastHashJoin.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/BroadcastHashJoin.scala @@ -20,7 +20,6 @@ package org.apache.spark.sql.execution.joins import scala.concurrent._ import scala.concurrent.duration._ -import org.apache.spark.annotation.DeveloperApi import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.Expression @@ -31,13 +30,11 @@ import org.apache.spark.util.ThreadUtils import org.apache.spark.{InternalAccumulator, TaskContext} /** - * :: DeveloperApi :: * Performs an inner hash join of two child relations. When the output RDD of this operator is * being constructed, a Spark job is asynchronously started to calculate the values for the * broadcasted relation. This data is then placed in a Spark broadcast variable. The streamed * relation is not shuffled. */ -@DeveloperApi case class BroadcastHashJoin( leftKeys: Seq[Expression], rightKeys: Seq[Expression], diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/BroadcastHashOuterJoin.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/BroadcastHashOuterJoin.scala index 69a8b95eaa7ec..ab81bd7b3fc04 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/BroadcastHashOuterJoin.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/BroadcastHashOuterJoin.scala @@ -20,7 +20,6 @@ package org.apache.spark.sql.execution.joins import scala.concurrent._ import scala.concurrent.duration._ -import org.apache.spark.annotation.DeveloperApi import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions._ @@ -31,13 +30,11 @@ import org.apache.spark.sql.execution.metric.SQLMetrics import org.apache.spark.{InternalAccumulator, TaskContext} /** - * :: DeveloperApi :: * Performs a outer hash join for two child relations. When the output RDD of this operator is * being constructed, a Spark job is asynchronously started to calculate the values for the * broadcasted relation. This data is then placed in a Spark broadcast variable. The streamed * relation is not shuffled. 
*/ -@DeveloperApi case class BroadcastHashOuterJoin( leftKeys: Seq[Expression], rightKeys: Seq[Expression], diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/BroadcastLeftSemiJoinHash.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/BroadcastLeftSemiJoinHash.scala index 78a8c16c62bca..c5cd6a2fd6372 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/BroadcastLeftSemiJoinHash.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/BroadcastLeftSemiJoinHash.scala @@ -18,7 +18,6 @@ package org.apache.spark.sql.execution.joins import org.apache.spark.{InternalAccumulator, TaskContext} -import org.apache.spark.annotation.DeveloperApi import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions._ @@ -26,11 +25,9 @@ import org.apache.spark.sql.execution.{BinaryNode, SparkPlan} import org.apache.spark.sql.execution.metric.SQLMetrics /** - * :: DeveloperApi :: * Build the right table's join keys into a HashSet, and iteratively go through the left * table, to find the if join keys are in the Hash set. */ -@DeveloperApi case class BroadcastLeftSemiJoinHash( leftKeys: Seq[Expression], rightKeys: Seq[Expression], diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/BroadcastNestedLoopJoin.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/BroadcastNestedLoopJoin.scala index 28c88b1b03d02..efef8c8a8b96a 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/BroadcastNestedLoopJoin.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/BroadcastNestedLoopJoin.scala @@ -17,7 +17,6 @@ package org.apache.spark.sql.execution.joins -import org.apache.spark.annotation.DeveloperApi import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions._ @@ -27,10 +26,7 @@ import org.apache.spark.sql.execution.{BinaryNode, SparkPlan} import org.apache.spark.sql.execution.metric.SQLMetrics import org.apache.spark.util.collection.CompactBuffer -/** - * :: DeveloperApi :: - */ -@DeveloperApi + case class BroadcastNestedLoopJoin( left: SparkPlan, right: SparkPlan, diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/CartesianProduct.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/CartesianProduct.scala index 2115f40702286..0243e196dbc37 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/CartesianProduct.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/CartesianProduct.scala @@ -17,17 +17,13 @@ package org.apache.spark.sql.execution.joins -import org.apache.spark.annotation.DeveloperApi import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Attribute, JoinedRow} import org.apache.spark.sql.execution.{BinaryNode, SparkPlan} import org.apache.spark.sql.execution.metric.SQLMetrics -/** - * :: DeveloperApi :: - */ -@DeveloperApi + case class CartesianProduct(left: SparkPlan, right: SparkPlan) extends BinaryNode { override def output: Seq[Attribute] = left.output ++ right.output diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashOuterJoin.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashOuterJoin.scala index 66903347c88c1..15b06b1537f8c 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashOuterJoin.scala 
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashOuterJoin.scala @@ -17,9 +17,6 @@ package org.apache.spark.sql.execution.joins -import java.util.{HashMap => JavaHashMap} - -import org.apache.spark.annotation.DeveloperApi import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans._ @@ -27,7 +24,7 @@ import org.apache.spark.sql.execution.SparkPlan import org.apache.spark.sql.execution.metric.LongSQLMetric import org.apache.spark.util.collection.CompactBuffer -@DeveloperApi + trait HashOuterJoin { self: SparkPlan => @@ -230,8 +227,8 @@ trait HashOuterJoin { protected[this] def buildHashTable( iter: Iterator[InternalRow], numIterRows: LongSQLMetric, - keyGenerator: Projection): JavaHashMap[InternalRow, CompactBuffer[InternalRow]] = { - val hashTable = new JavaHashMap[InternalRow, CompactBuffer[InternalRow]]() + keyGenerator: Projection): java.util.HashMap[InternalRow, CompactBuffer[InternalRow]] = { + val hashTable = new java.util.HashMap[InternalRow, CompactBuffer[InternalRow]]() while (iter.hasNext) { val currentRow = iter.next() numIterRows += 1 diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/LeftSemiJoinBNL.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/LeftSemiJoinBNL.scala index ad6362542f2ff..efa7b49410edc 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/LeftSemiJoinBNL.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/LeftSemiJoinBNL.scala @@ -17,7 +17,6 @@ package org.apache.spark.sql.execution.joins -import org.apache.spark.annotation.DeveloperApi import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions._ @@ -26,11 +25,9 @@ import org.apache.spark.sql.execution.{BinaryNode, SparkPlan} import org.apache.spark.sql.execution.metric.SQLMetrics /** - * :: DeveloperApi :: * Using BroadcastNestedLoopJoin to calculate left semi join result when there's no join keys * for hash join. */ -@DeveloperApi case class LeftSemiJoinBNL( streamed: SparkPlan, broadcast: SparkPlan, condition: Option[Expression]) extends BinaryNode { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/LeftSemiJoinHash.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/LeftSemiJoinHash.scala index 18808adaac63f..bf3b05be981fb 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/LeftSemiJoinHash.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/LeftSemiJoinHash.scala @@ -17,7 +17,6 @@ package org.apache.spark.sql.execution.joins -import org.apache.spark.annotation.DeveloperApi import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions._ @@ -26,11 +25,9 @@ import org.apache.spark.sql.execution.{BinaryNode, SparkPlan} import org.apache.spark.sql.execution.metric.SQLMetrics /** - * :: DeveloperApi :: * Build the right table's join keys into a HashSet, and iteratively go through the left * table, to find the if join keys are in the Hash set. 
*/ -@DeveloperApi case class LeftSemiJoinHash( leftKeys: Seq[Expression], rightKeys: Seq[Expression], diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/ShuffledHashJoin.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/ShuffledHashJoin.scala index fc8c9439a6f07..755986af8b95e 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/ShuffledHashJoin.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/ShuffledHashJoin.scala @@ -17,7 +17,6 @@ package org.apache.spark.sql.execution.joins -import org.apache.spark.annotation.DeveloperApi import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.Expression @@ -26,11 +25,9 @@ import org.apache.spark.sql.execution.{BinaryNode, SparkPlan} import org.apache.spark.sql.execution.metric.SQLMetrics /** - * :: DeveloperApi :: * Performs an inner hash join of two child relations by first shuffling the data using the join * keys. */ -@DeveloperApi case class ShuffledHashJoin( leftKeys: Seq[Expression], rightKeys: Seq[Expression], diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/ShuffledHashOuterJoin.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/ShuffledHashOuterJoin.scala index d800c7456bdac..6b2cb9d8f6893 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/ShuffledHashOuterJoin.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/ShuffledHashOuterJoin.scala @@ -19,7 +19,6 @@ package org.apache.spark.sql.execution.joins import scala.collection.JavaConverters._ -import org.apache.spark.annotation.DeveloperApi import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions._ @@ -29,11 +28,9 @@ import org.apache.spark.sql.execution.{BinaryNode, SparkPlan} import org.apache.spark.sql.execution.metric.SQLMetrics /** - * :: DeveloperApi :: * Performs a hash based outer join for two child relations by shuffling the data using * the join keys. This operator requires loading the associated partition in both side into memory. */ -@DeveloperApi case class ShuffledHashOuterJoin( leftKeys: Seq[Expression], rightKeys: Seq[Expression], diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/SortMergeJoin.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/SortMergeJoin.scala index 70a1af6a7063a..17030947b7bbc 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/SortMergeJoin.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/SortMergeJoin.scala @@ -19,7 +19,6 @@ package org.apache.spark.sql.execution.joins import scala.collection.mutable.ArrayBuffer -import org.apache.spark.annotation.DeveloperApi import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions._ @@ -28,10 +27,8 @@ import org.apache.spark.sql.execution.{BinaryNode, RowIterator, SparkPlan} import org.apache.spark.sql.execution.metric.{LongSQLMetric, SQLMetrics} /** - * :: DeveloperApi :: * Performs an sort merge join of two child relations. 
*/ -@DeveloperApi case class SortMergeJoin( leftKeys: Seq[Expression], rightKeys: Seq[Expression], diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/SortMergeOuterJoin.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/SortMergeOuterJoin.scala index c117dff9c8b1d..7e854e6702f77 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/SortMergeOuterJoin.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/SortMergeOuterJoin.scala @@ -19,7 +19,6 @@ package org.apache.spark.sql.execution.joins import scala.collection.mutable.ArrayBuffer -import org.apache.spark.annotation.DeveloperApi import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions._ @@ -30,10 +29,8 @@ import org.apache.spark.sql.execution.{BinaryNode, RowIterator, SparkPlan} import org.apache.spark.util.collection.BitSet /** - * :: DeveloperApi :: * Performs an sort merge outer join of two child relations. */ -@DeveloperApi case class SortMergeOuterJoin( leftKeys: Seq[Expression], rightKeys: Seq[Expression], diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/package.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/package.scala index 7f2ab1765b28f..134376628ae7f 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/package.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/package.scala @@ -17,21 +17,15 @@ package org.apache.spark.sql.execution -import org.apache.spark.annotation.DeveloperApi - /** - * :: DeveloperApi :: * Physical execution operators for join operations. */ package object joins { - @DeveloperApi sealed abstract class BuildSide - @DeveloperApi case object BuildRight extends BuildSide - @DeveloperApi case object BuildLeft extends BuildSide } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/python.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/python.scala index 5dbe0fc5f95c7..d4e6980967e82 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/python.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/python.scala @@ -24,12 +24,11 @@ import scala.collection.JavaConverters._ import net.razorvine.pickle._ -import org.apache.spark.annotation.DeveloperApi import org.apache.spark.api.python.{PythonRunner, PythonBroadcast, PythonRDD, SerDeUtil} import org.apache.spark.broadcast.Broadcast import org.apache.spark.rdd.RDD import org.apache.spark.sql.DataFrame -import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow} +import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans.logical import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan @@ -320,10 +319,8 @@ object EvaluatePython { } /** - * :: DeveloperApi :: * Evaluates a [[PythonUDF]], appending the result to the end of the input tuple. */ -@DeveloperApi case class EvaluatePython( udf: PythonUDF, child: LogicalPlan, @@ -337,7 +334,6 @@ case class EvaluatePython( } /** - * :: DeveloperApi :: * Uses PythonRDD to evaluate a [[PythonUDF]], one partition of tuples at a time. * * Python evaluation works by sending the necessary (projected) input data via a socket to an @@ -347,7 +343,6 @@ case class EvaluatePython( * we drain the queue to find the original input row. 
Note that if the Python process is way too * slow, this could lead to the queue growing unbounded and eventually run out of memory. */ -@DeveloperApi case class BatchPythonEvaluation(udf: PythonUDF, output: Seq[Attribute], child: SparkPlan) extends SparkPlan { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/rowFormatConverters.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/rowFormatConverters.scala index 855555dd1d4c4..0e601cd2cab5d 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/rowFormatConverters.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/rowFormatConverters.scala @@ -17,7 +17,6 @@ package org.apache.spark.sql.execution -import org.apache.spark.annotation.DeveloperApi import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions._ @@ -25,10 +24,8 @@ import org.apache.spark.sql.catalyst.plans.physical.Partitioning import org.apache.spark.sql.catalyst.rules.Rule /** - * :: DeveloperApi :: * Converts Java-object-based rows into [[UnsafeRow]]s. */ -@DeveloperApi case class ConvertToUnsafe(child: SparkPlan) extends UnaryNode { require(UnsafeProjection.canSupport(child.schema), s"Cannot convert ${child.schema} to Unsafe") @@ -48,10 +45,8 @@ case class ConvertToUnsafe(child: SparkPlan) extends UnaryNode { } /** - * :: DeveloperApi :: * Converts [[UnsafeRow]]s back into Java-object-based rows. */ -@DeveloperApi case class ConvertToSafe(child: SparkPlan) extends UnaryNode { override def output: Seq[Attribute] = child.output override def outputPartitioning: Partitioning = child.outputPartitioning diff --git a/sql/core/src/main/scala/org/apache/spark/sql/test/ExamplePointUDT.scala b/sql/core/src/main/scala/org/apache/spark/sql/test/ExamplePointUDT.scala index 963e6030c14c8..a741a45f1c527 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/test/ExamplePointUDT.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/test/ExamplePointUDT.scala @@ -17,9 +17,6 @@ package org.apache.spark.sql.test -import java.util - -import scala.collection.JavaConverters._ import org.apache.spark.sql.types._ /** From 1baaf2b9bd7c949a8f95cd14fc1be2a56e1139b3 Mon Sep 17 00:00:00 2001 From: Cheng Hao Date: Wed, 14 Oct 2015 16:29:32 -0700 Subject: [PATCH 058/139] [SPARK-10829] [SQL] Filter combine partition key and attribute doesn't work in DataSource scan ```scala withSQLConf(SQLConf.PARQUET_FILTER_PUSHDOWN_ENABLED.key -> "true") { withTempPath { dir => val path = s"${dir.getCanonicalPath}/part=1" (1 to 3).map(i => (i, i.toString)).toDF("a", "b").write.parquet(path) // If the "part = 1" filter gets pushed down, this query will throw an exception since // "part" is not a valid column in the actual Parquet file checkAnswer( sqlContext.read.parquet(path).filter("a > 0 and (part = 0 or a > 1)"), (2 to 3).map(i => Row(i, i.toString, 1))) } } ``` We expect the result to be: ``` 2,1 3,1 ``` But got ``` 1,1 2,1 3,1 ``` Author: Cheng Hao Closes #8916 from chenghao-intel/partition_filter. 
--- .../datasources/DataSourceStrategy.scala | 34 ++++++++++++------- .../parquet/ParquetFilterSuite.scala | 17 ++++++++++ 2 files changed, 39 insertions(+), 12 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala index 918db8e7d083e..33181fa6c065f 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala @@ -62,7 +62,22 @@ private[sql] object DataSourceStrategy extends Strategy with Logging { // Scanning partitioned HadoopFsRelation case PhysicalOperation(projects, filters, l @ LogicalRelation(t: HadoopFsRelation, _)) if t.partitionSpec.partitionColumns.nonEmpty => - val selectedPartitions = prunePartitions(filters, t.partitionSpec).toArray + // We divide the filter expressions into 3 parts + val partitionColumnNames = t.partitionSpec.partitionColumns.map(_.name).toSet + + // TODO this is case-sensitive + // Only prunning the partition keys + val partitionFilters = + filters.filter(_.references.map(_.name).toSet.subsetOf(partitionColumnNames)) + + // Only pushes down predicates that do not reference partition keys. + val pushedFilters = + filters.filter(_.references.map(_.name).toSet.intersect(partitionColumnNames).isEmpty) + + // Predicates with both partition keys and attributes + val combineFilters = filters.toSet -- partitionFilters.toSet -- pushedFilters.toSet + + val selectedPartitions = prunePartitions(partitionFilters, t.partitionSpec).toArray logInfo { val total = t.partitionSpec.partitions.length @@ -71,21 +86,16 @@ private[sql] object DataSourceStrategy extends Strategy with Logging { s"Selected $selected partitions out of $total, pruned $percentPruned% partitions." } - // Only pushes down predicates that do not reference partition columns. 
- val pushedFilters = { - val partitionColumnNames = t.partitionSpec.partitionColumns.map(_.name).toSet - filters.filter { f => - val referencedColumnNames = f.references.map(_.name).toSet - referencedColumnNames.intersect(partitionColumnNames).isEmpty - } - } - - buildPartitionedTableScan( + val scan = buildPartitionedTableScan( l, projects, pushedFilters, t.partitionSpec.partitionColumns, - selectedPartitions) :: Nil + selectedPartitions) + + combineFilters + .reduceLeftOption(expressions.And) + .map(execution.Filter(_, scan)).getOrElse(scan) :: Nil // Scanning non-partitioned HadoopFsRelation case PhysicalOperation(projects, filters, l @ LogicalRelation(t: HadoopFsRelation, _)) => diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilterSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilterSuite.scala index 45ad3fde559c0..7a23f57f40392 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilterSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilterSuite.scala @@ -297,4 +297,21 @@ class ParquetFilterSuite extends QueryTest with ParquetTest with SharedSQLContex } } } + + test("SPARK-10829: Filter combine partition key and attribute doesn't work in DataSource scan") { + import testImplicits._ + + withSQLConf(SQLConf.PARQUET_FILTER_PUSHDOWN_ENABLED.key -> "true") { + withTempPath { dir => + val path = s"${dir.getCanonicalPath}/part=1" + (1 to 3).map(i => (i, i.toString)).toDF("a", "b").write.parquet(path) + + // If the "part = 1" filter gets pushed down, this query will throw an exception since + // "part" is not a valid column in the actual Parquet file + checkAnswer( + sqlContext.read.parquet(path).filter("a > 0 and (part = 0 or a > 1)"), + (2 to 3).map(i => Row(i, i.toString, 1))) + } + } + } } From 4ace4f8a9c91beb21a0077e12b75637a4560a542 Mon Sep 17 00:00:00 2001 From: Josh Rosen Date: Wed, 14 Oct 2015 17:27:27 -0700 Subject: [PATCH 059/139] [SPARK-11017] [SQL] Support ImperativeAggregates in TungstenAggregate This patch extends TungstenAggregate to support ImperativeAggregate functions. The existing TungstenAggregate operator only supported DeclarativeAggregate functions, which are defined in terms of Catalyst expressions and can be evaluated via generated projections. ImperativeAggregate functions, on the other hand, are evaluated by calling their `initialize`, `update`, `merge`, and `eval` methods. The basic strategy here is similar to how SortBasedAggregate evaluates both types of aggregate functions: use a generated projection to evaluate the expression-based declarative aggregates with dummy placeholder expressions inserted in place of the imperative aggregate function output, then invoke the imperative aggregate functions and target them against the aggregation buffer. The bulk of the diff here consists of code that was copied and adapted from SortBasedAggregate, with some key changes to handle TungstenAggregate's sort fallback path. Author: Josh Rosen Closes #9038 from JoshRosen/support-interpreted-in-tungsten-agg-final. 
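As a rough illustration of the shared-buffer strategy described above, the standalone sketch below updates a single aggregation buffer with one projection-style call for the declarative part and a direct method call for the imperative part. `MixedAggregateSketch`, `MaxAgg`, and the plain `Array[Long]` buffer are simplified, hypothetical stand-ins, not Spark's actual `UnsafeRow`, `MutableProjection`, or `ImperativeAggregate` types.

```scala
// Illustration only: a toy version of the mixed update path.
// A shared row-like buffer holds one slot per aggregate; "declarative" aggregates
// are updated through a single projection-like function, while an "imperative"
// aggregate updates its own slot via an explicit buffer offset.
object MixedAggregateSketch {

  type Buffer = Array[Long]

  // Stand-in for a generated projection over the declarative aggregates:
  // maintains SUM(x) in slot 0 and COUNT(*) in slot 1. The imperative
  // aggregate's slot (2) is left untouched, mirroring the NoOp placeholders.
  def declarativeUpdate(buffer: Buffer, input: Long): Unit = {
    buffer(0) += input // sum
    buffer(1) += 1L    // count
  }

  // Stand-in for an imperative aggregate: it only knows its buffer offset and
  // manipulates the shared buffer by field position (MAX(x) in slot 2 here).
  final class MaxAgg(val mutableAggBufferOffset: Int) {
    def initialize(buffer: Buffer): Unit =
      buffer(mutableAggBufferOffset) = Long.MinValue
    def update(buffer: Buffer, input: Long): Unit = {
      val i = mutableAggBufferOffset
      if (input > buffer(i)) buffer(i) = input
    }
  }

  def main(args: Array[String]): Unit = {
    val buffer: Buffer = Array(0L, 0L, 0L)
    val max = new MaxAgg(mutableAggBufferOffset = 2)
    max.initialize(buffer)

    Seq(3L, 7L, 5L).foreach { row =>
      declarativeUpdate(buffer, row) // expression-based part, one projection call
      max.update(buffer, row)        // imperative part, direct method call
    }
    println(buffer.toSeq) // the three slots end up as sum = 15, count = 3, max = 7
  }
}
```

The detail this is meant to highlight is that the imperative aggregate addresses the shared buffer purely through its offset, which is why the patch recomputes offsets with `withNewMutableAggBufferOffset` and `withNewInputAggBufferOffset` when the iterator falls back to sort-based aggregation.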
--- .../expressions/aggregate/functions.scala | 19 +- .../expressions/aggregate/interfaces.scala | 31 +- .../aggregate/AggregationIterator.scala | 29 +- .../aggregate/TungstenAggregate.scala | 22 +- .../TungstenAggregationIterator.scala | 250 ++++++++++++---- .../spark/sql/execution/aggregate/udaf.scala | 79 +++-- .../spark/sql/execution/aggregate/utils.scala | 269 +++++++++--------- .../TungstenAggregationIteratorSuite.scala | 2 +- .../org/apache/spark/sql/hive/hiveUDFs.scala | 16 +- 9 files changed, 457 insertions(+), 260 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/functions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/functions.scala index 8aad0b7dee054..c0bc7ec09c34a 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/functions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/functions.scala @@ -472,10 +472,20 @@ case class Sum(child: Expression) extends DeclarativeAggregate { * @param relativeSD the maximum estimation error allowed. */ // scalastyle:on -case class HyperLogLogPlusPlus(child: Expression, relativeSD: Double = 0.05) - extends ImperativeAggregate { +case class HyperLogLogPlusPlus( + child: Expression, + relativeSD: Double = 0.05, + mutableAggBufferOffset: Int = 0, + inputAggBufferOffset: Int = 0) + extends ImperativeAggregate { import HyperLogLogPlusPlus._ + override def withNewMutableAggBufferOffset(newMutableAggBufferOffset: Int): ImperativeAggregate = + copy(mutableAggBufferOffset = newMutableAggBufferOffset) + + override def withNewInputAggBufferOffset(newInputAggBufferOffset: Int): ImperativeAggregate = + copy(inputAggBufferOffset = newInputAggBufferOffset) + /** * HLL++ uses 'p' bits for addressing. The more addressing bits we use, the more precise the * algorithm will be, and the more memory it will require. The 'p' value is based on the relative @@ -546,6 +556,11 @@ case class HyperLogLogPlusPlus(child: Expression, relativeSD: Double = 0.05) AttributeReference(s"MS[$i]", LongType)() } + // Note: although this simply copies aggBufferAttributes, this common code can not be placed + // in the superclass because that will lead to initialization ordering issues. + override val inputAggBufferAttributes: Seq[AttributeReference] = + aggBufferAttributes.map(_.newInstance()) + /** Fill all words with zeros. */ override def initialize(buffer: MutableRow): Unit = { var word = 0 diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/interfaces.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/interfaces.scala index 9ba3a9c980457..a2fab258fcac3 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/interfaces.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/interfaces.scala @@ -150,6 +150,10 @@ sealed abstract class AggregateFunction2 extends Expression with ImplicitCastInp * We need to perform similar field number arithmetic when merging multiple intermediate * aggregate buffers together in `merge()` (in this case, use `inputAggBufferOffset` when accessing * the input buffer). + * + * Correct ImperativeAggregate evaluation depends on the correctness of `mutableAggBufferOffset` and + * `inputAggBufferOffset`, but not on the correctness of the attribute ids in `aggBufferAttributes` + * and `inputAggBufferAttributes`. 
*/ abstract class ImperativeAggregate extends AggregateFunction2 { @@ -172,11 +176,13 @@ abstract class ImperativeAggregate extends AggregateFunction2 { * avg(y) mutableAggBufferOffset = 2 * */ - protected var mutableAggBufferOffset: Int = 0 + protected val mutableAggBufferOffset: Int - def withNewMutableAggBufferOffset(newMutableAggBufferOffset: Int): Unit = { - mutableAggBufferOffset = newMutableAggBufferOffset - } + /** + * Returns a copy of this ImperativeAggregate with an updated mutableAggBufferOffset. + * This new copy's attributes may have different ids than the original. + */ + def withNewMutableAggBufferOffset(newMutableAggBufferOffset: Int): ImperativeAggregate /** * The offset of this function's start buffer value in the underlying shared input aggregation @@ -203,11 +209,17 @@ abstract class ImperativeAggregate extends AggregateFunction2 { * avg(y) inputAggBufferOffset = 3 * */ - protected var inputAggBufferOffset: Int = 0 + protected val inputAggBufferOffset: Int - def withNewInputAggBufferOffset(newInputAggBufferOffset: Int): Unit = { - inputAggBufferOffset = newInputAggBufferOffset - } + /** + * Returns a copy of this ImperativeAggregate with an updated mutableAggBufferOffset. + * This new copy's attributes may have different ids than the original. + */ + def withNewInputAggBufferOffset(newInputAggBufferOffset: Int): ImperativeAggregate + + // Note: although all subclasses implement inputAggBufferAttributes by simply cloning + // aggBufferAttributes, that common clone code cannot be placed here in the abstract + // ImperativeAggregate class, since that will lead to initialization ordering issues. /** * Initializes the mutable aggregation buffer located in `mutableAggBuffer`. @@ -231,9 +243,6 @@ abstract class ImperativeAggregate extends AggregateFunction2 { * Use `fieldNumber + inputAggBufferOffset` to access fields of `inputAggBuffer`. */ def merge(mutableAggBuffer: MutableRow, inputAggBuffer: InternalRow): Unit - - final lazy val inputAggBufferAttributes: Seq[AttributeReference] = - aggBufferAttributes.map(_.newInstance()) } /** diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/AggregationIterator.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/AggregationIterator.scala index 8e0fbd109b413..99fb7a40b72e1 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/AggregationIterator.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/AggregationIterator.scala @@ -83,7 +83,7 @@ abstract class AggregationIterator( var i = 0 while (i < allAggregateExpressions.length) { val func = allAggregateExpressions(i).aggregateFunction - val funcWithBoundReferences = allAggregateExpressions(i).mode match { + val funcWithBoundReferences: AggregateFunction2 = allAggregateExpressions(i).mode match { case Partial | Complete if func.isInstanceOf[ImperativeAggregate] => // We need to create BoundReferences if the function is not an // expression-based aggregate function (it does not support code-gen) and the mode of @@ -94,24 +94,24 @@ abstract class AggregationIterator( case _ => // We only need to set inputBufferOffset for aggregate functions with mode // PartialMerge and Final. - func match { + val updatedFunc = func match { case function: ImperativeAggregate => function.withNewInputAggBufferOffset(inputBufferOffset) - case _ => + case function => function } inputBufferOffset += func.aggBufferSchema.length - func + updatedFunc } - // Set mutableBufferOffset for this function. 
It is important that setting - // mutableBufferOffset happens after all potential bindReference operations - // because bindReference will create a new instance of the function. - funcWithBoundReferences match { + val funcWithUpdatedAggBufferOffset = funcWithBoundReferences match { case function: ImperativeAggregate => + // Set mutableBufferOffset for this function. It is important that setting + // mutableBufferOffset happens after all potential bindReference operations + // because bindReference will create a new instance of the function. function.withNewMutableAggBufferOffset(mutableBufferOffset) - case _ => + case function => function } - mutableBufferOffset += funcWithBoundReferences.aggBufferSchema.length - functions(i) = funcWithBoundReferences + mutableBufferOffset += funcWithUpdatedAggBufferOffset.aggBufferSchema.length + functions(i) = funcWithUpdatedAggBufferOffset i += 1 } functions @@ -320,7 +320,7 @@ abstract class AggregationIterator( // Initializing the function used to generate the output row. protected val generateOutput: (InternalRow, MutableRow) => InternalRow = { val rowToBeEvaluated = new JoinedRow - val safeOutputRow = new GenericMutableRow(resultExpressions.length) + val safeOutputRow = new SpecificMutableRow(resultExpressions.map(_.dataType)) val mutableOutput = if (outputsUnsafeRows) { UnsafeProjection.create(resultExpressions.map(_.dataType).toArray).apply(safeOutputRow) } else { @@ -358,7 +358,8 @@ abstract class AggregationIterator( val expressionAggEvalProjection = newMutableProjection(evalExpressions, bufferSchemata)() val aggregateResultSchema = nonCompleteAggregateAttributes ++ completeAggregateAttributes // TODO: Use unsafe row. - val aggregateResult = new GenericMutableRow(aggregateResultSchema.length) + val aggregateResult = new SpecificMutableRow(aggregateResultSchema.map(_.dataType)) + expressionAggEvalProjection.target(aggregateResult) val resultProjection = newMutableProjection( resultExpressions, groupingKeyAttributes ++ aggregateResultSchema)() @@ -366,7 +367,7 @@ abstract class AggregationIterator( (currentGroupingKey: InternalRow, currentBuffer: MutableRow) => { // Generate results for all expression-based aggregate functions. - expressionAggEvalProjection.target(aggregateResult)(currentBuffer) + expressionAggEvalProjection(currentBuffer) // Generate results for all imperative aggregate functions. 
var i = 0 while (i < allImperativeAggregateFunctions.length) { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/TungstenAggregate.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/TungstenAggregate.scala index 7b3d072b2e067..c342940e6e757 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/TungstenAggregate.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/TungstenAggregate.scala @@ -24,8 +24,9 @@ import org.apache.spark.sql.catalyst.errors._ import org.apache.spark.sql.catalyst.expressions.aggregate.AggregateExpression2 import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans.physical._ -import org.apache.spark.sql.execution.{UnaryNode, SparkPlan} +import org.apache.spark.sql.execution.{UnsafeFixedWidthAggregationMap, UnaryNode, SparkPlan} import org.apache.spark.sql.execution.metric.SQLMetrics +import org.apache.spark.sql.types.StructType case class TungstenAggregate( requiredChildDistributionExpressions: Option[Seq[Expression]], @@ -34,10 +35,18 @@ case class TungstenAggregate( nonCompleteAggregateAttributes: Seq[Attribute], completeAggregateExpressions: Seq[AggregateExpression2], completeAggregateAttributes: Seq[Attribute], + initialInputBufferOffset: Int, resultExpressions: Seq[NamedExpression], child: SparkPlan) extends UnaryNode { + private[this] val aggregateBufferAttributes = { + (nonCompleteAggregateExpressions ++ completeAggregateExpressions) + .flatMap(_.aggregateFunction.aggBufferAttributes) + } + + require(TungstenAggregate.supportsAggregate(groupingExpressions, aggregateBufferAttributes)) + override private[sql] lazy val metrics = Map( "numInputRows" -> SQLMetrics.createLongMetric(sparkContext, "number of input rows"), "numOutputRows" -> SQLMetrics.createLongMetric(sparkContext, "number of output rows")) @@ -82,6 +91,7 @@ case class TungstenAggregate( nonCompleteAggregateAttributes, completeAggregateExpressions, completeAggregateAttributes, + initialInputBufferOffset, resultExpressions, newMutableProjection, child.output, @@ -138,3 +148,13 @@ case class TungstenAggregate( } } } + +object TungstenAggregate { + def supportsAggregate( + groupingExpressions: Seq[Expression], + aggregateBufferAttributes: Seq[Attribute]): Boolean = { + val aggregationBufferSchema = StructType.fromAttributes(aggregateBufferAttributes) + UnsafeFixedWidthAggregationMap.supportsAggregationBufferSchema(aggregationBufferSchema) && + UnsafeProjection.canSupport(groupingExpressions) + } +} diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/TungstenAggregationIterator.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/TungstenAggregationIterator.scala index 4bb95c9eb7f3e..fe708a5f71f79 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/TungstenAggregationIterator.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/TungstenAggregationIterator.scala @@ -17,6 +17,8 @@ package org.apache.spark.sql.execution.aggregate +import scala.collection.mutable.ArrayBuffer + import org.apache.spark.unsafe.KVIterator import org.apache.spark.{InternalAccumulator, Logging, SparkEnv, TaskContext} import org.apache.spark.sql.catalyst.expressions._ @@ -79,6 +81,7 @@ class TungstenAggregationIterator( nonCompleteAggregateAttributes: Seq[Attribute], completeAggregateExpressions: Seq[AggregateExpression2], completeAggregateAttributes: Seq[Attribute], + initialInputBufferOffset: Int, 
resultExpressions: Seq[NamedExpression], newMutableProjection: (Seq[Expression], Seq[Attribute]) => (() => MutableProjection), originalInputAttributes: Seq[Attribute], @@ -134,19 +137,74 @@ class TungstenAggregationIterator( completeAggregateExpressions.map(_.mode).distinct.headOption } - // All aggregate functions. TungstenAggregationIterator only handles expression-based aggregate. - // If there is any functions that is an ImperativeAggregateFunction, we throw an - // IllegalStateException. - private[this] val allAggregateFunctions: Array[DeclarativeAggregate] = { - if (!allAggregateExpressions.forall( - _.aggregateFunction.isInstanceOf[DeclarativeAggregate])) { - throw new IllegalStateException( - "Only ExpressionAggregateFunctions should be passed in TungstenAggregationIterator.") + // Initialize all AggregateFunctions by binding references, if necessary, + // and setting inputBufferOffset and mutableBufferOffset. + private def initializeAllAggregateFunctions( + startingInputBufferOffset: Int): Array[AggregateFunction2] = { + var mutableBufferOffset = 0 + var inputBufferOffset: Int = startingInputBufferOffset + val functions = new Array[AggregateFunction2](allAggregateExpressions.length) + var i = 0 + while (i < allAggregateExpressions.length) { + val func = allAggregateExpressions(i).aggregateFunction + val aggregateExpressionIsNonComplete = i < nonCompleteAggregateExpressions.length + // We need to use this mode instead of func.mode in order to handle aggregation mode switching + // when switching to sort-based aggregation: + val mode = if (aggregateExpressionIsNonComplete) aggregationMode._1 else aggregationMode._2 + val funcWithBoundReferences = mode match { + case Some(Partial) | Some(Complete) if func.isInstanceOf[ImperativeAggregate] => + // We need to create BoundReferences if the function is not an + // expression-based aggregate function (it does not support code-gen) and the mode of + // this function is Partial or Complete because we will call eval of this + // function's children in the update method of this aggregate function. + // Those eval calls require BoundReferences to work. + BindReferences.bindReference(func, originalInputAttributes) + case _ => + // We only need to set inputBufferOffset for aggregate functions with mode + // PartialMerge and Final. + val updatedFunc = func match { + case function: ImperativeAggregate => + function.withNewInputAggBufferOffset(inputBufferOffset) + case function => function + } + inputBufferOffset += func.aggBufferSchema.length + updatedFunc + } + val funcWithUpdatedAggBufferOffset = funcWithBoundReferences match { + case function: ImperativeAggregate => + // Set mutableBufferOffset for this function. It is important that setting + // mutableBufferOffset happens after all potential bindReference operations + // because bindReference will create a new instance of the function. + function.withNewMutableAggBufferOffset(mutableBufferOffset) + case function => function + } + mutableBufferOffset += funcWithUpdatedAggBufferOffset.aggBufferSchema.length + functions(i) = funcWithUpdatedAggBufferOffset + i += 1 } + functions + } - allAggregateExpressions - .map(_.aggregateFunction.asInstanceOf[DeclarativeAggregate]) - .toArray + private[this] var allAggregateFunctions: Array[AggregateFunction2] = + initializeAllAggregateFunctions(initialInputBufferOffset) + + // Positions of those imperative aggregate functions in allAggregateFunctions. 
+ // For example, say that we have func1, func2, func3, func4 in aggregateFunctions, and + // func2 and func3 are imperative aggregate functions. Then + // allImperativeAggregateFunctionPositions will be [1, 2]. Note that this does not need to be + // updated when falling back to sort-based aggregation because the positions of the aggregate + // functions do not change in that case. + private[this] val allImperativeAggregateFunctionPositions: Array[Int] = { + val positions = new ArrayBuffer[Int]() + var i = 0 + while (i < allAggregateFunctions.length) { + allAggregateFunctions(i) match { + case agg: DeclarativeAggregate => + case _ => positions += i + } + i += 1 + } + positions.toArray } /////////////////////////////////////////////////////////////////////////// @@ -155,25 +213,31 @@ class TungstenAggregationIterator( // rows. /////////////////////////////////////////////////////////////////////////// - // The projection used to initialize buffer values. - private[this] val initialProjection: MutableProjection = { - val initExpressions = allAggregateFunctions.flatMap(_.initialValues) + // The projection used to initialize buffer values for all expression-based aggregates. + // Note that this projection does not need to be updated when switching to sort-based aggregation + // because the schema of empty aggregation buffers does not change in that case. + private[this] val expressionAggInitialProjection: MutableProjection = { + val initExpressions = allAggregateFunctions.flatMap { + case ae: DeclarativeAggregate => ae.initialValues + // For the positions corresponding to imperative aggregate functions, we'll use special + // no-op expressions which are ignored during projection code-generation. + case i: ImperativeAggregate => Seq.fill(i.aggBufferAttributes.length)(NoOp) + } newMutableProjection(initExpressions, Nil)() } // Creates a new aggregation buffer and initializes buffer values. - // This functions should be only called at most three times (when we create the hash map, + // This function should be only called at most three times (when we create the hash map, // when we switch to sort-based aggregation, and when we create the re-used buffer for // sort-based aggregation). 
private def createNewAggregationBuffer(): UnsafeRow = { val bufferSchema = allAggregateFunctions.flatMap(_.aggBufferAttributes) - val bufferRowSize: Int = bufferSchema.length - - val genericMutableBuffer = new GenericMutableRow(bufferRowSize) - val unsafeProjection = - UnsafeProjection.create(bufferSchema.map(_.dataType)) - val buffer = unsafeProjection.apply(genericMutableBuffer) - initialProjection.target(buffer)(EmptyRow) + val buffer: UnsafeRow = UnsafeProjection.create(bufferSchema.map(_.dataType)) + .apply(new GenericMutableRow(bufferSchema.length)) + // Initialize declarative aggregates' buffer values + expressionAggInitialProjection.target(buffer)(EmptyRow) + // Initialize imperative aggregates' buffer values + allAggregateFunctions.collect { case f: ImperativeAggregate => f }.foreach(_.initialize(buffer)) buffer } @@ -187,72 +251,124 @@ class TungstenAggregationIterator( aggregationMode match { // Partial-only case (Some(Partial), None) => - val updateExpressions = allAggregateFunctions.flatMap(_.updateExpressions) - val updateProjection = + val updateExpressions = allAggregateFunctions.flatMap { + case ae: DeclarativeAggregate => ae.updateExpressions + case agg: AggregateFunction2 => Seq.fill(agg.aggBufferAttributes.length)(NoOp) + } + val imperativeAggregateFunctions: Array[ImperativeAggregate] = + allAggregateFunctions.collect { case func: ImperativeAggregate => func} + val expressionAggUpdateProjection = newMutableProjection(updateExpressions, aggregationBufferAttributes ++ inputAttributes)() (currentBuffer: UnsafeRow, row: InternalRow) => { - updateProjection.target(currentBuffer) - updateProjection(joinedRow(currentBuffer, row)) + expressionAggUpdateProjection.target(currentBuffer) + // Process all expression-based aggregate functions. + expressionAggUpdateProjection(joinedRow(currentBuffer, row)) + // Process all imperative aggregate functions + var i = 0 + while (i < imperativeAggregateFunctions.length) { + imperativeAggregateFunctions(i).update(currentBuffer, row) + i += 1 + } } // PartialMerge-only or Final-only case (Some(PartialMerge), None) | (Some(Final), None) => - val mergeExpressions = allAggregateFunctions.flatMap(_.mergeExpressions) - val mergeProjection = + val mergeExpressions = allAggregateFunctions.flatMap { + case ae: DeclarativeAggregate => ae.mergeExpressions + case agg: AggregateFunction2 => Seq.fill(agg.aggBufferAttributes.length)(NoOp) + } + val imperativeAggregateFunctions: Array[ImperativeAggregate] = + allAggregateFunctions.collect { case func: ImperativeAggregate => func} + // This projection is used to merge buffer values for all expression-based aggregates. + val expressionAggMergeProjection = newMutableProjection(mergeExpressions, aggregationBufferAttributes ++ inputAttributes)() (currentBuffer: UnsafeRow, row: InternalRow) => { - mergeProjection.target(currentBuffer) - mergeProjection(joinedRow(currentBuffer, row)) + // Process all expression-based aggregate functions. + expressionAggMergeProjection.target(currentBuffer)(joinedRow(currentBuffer, row)) + // Process all imperative aggregate functions. 
+ var i = 0 + while (i < imperativeAggregateFunctions.length) { + imperativeAggregateFunctions(i).merge(currentBuffer, row) + i += 1 + } } // Final-Complete case (Some(Final), Some(Complete)) => - val nonCompleteAggregateFunctions: Array[DeclarativeAggregate] = - allAggregateFunctions.take(nonCompleteAggregateExpressions.length) - val completeAggregateFunctions: Array[DeclarativeAggregate] = + val completeAggregateFunctions: Array[AggregateFunction2] = allAggregateFunctions.takeRight(completeAggregateExpressions.length) + val completeImperativeAggregateFunctions: Array[ImperativeAggregate] = + completeAggregateFunctions.collect { case func: ImperativeAggregate => func } + val nonCompleteAggregateFunctions: Array[AggregateFunction2] = + allAggregateFunctions.take(nonCompleteAggregateExpressions.length) + val nonCompleteImperativeAggregateFunctions: Array[ImperativeAggregate] = + nonCompleteAggregateFunctions.collect { case func: ImperativeAggregate => func } val completeOffsetExpressions = Seq.fill(completeAggregateFunctions.map(_.aggBufferAttributes.length).sum)(NoOp) val mergeExpressions = - nonCompleteAggregateFunctions.flatMap(_.mergeExpressions) ++ completeOffsetExpressions + nonCompleteAggregateFunctions.flatMap { + case ae: DeclarativeAggregate => ae.mergeExpressions + case agg: AggregateFunction2 => Seq.fill(agg.aggBufferAttributes.length)(NoOp) + } ++ completeOffsetExpressions val finalMergeProjection = newMutableProjection(mergeExpressions, aggregationBufferAttributes ++ inputAttributes)() // We do not touch buffer values of aggregate functions with the Final mode. val finalOffsetExpressions = Seq.fill(nonCompleteAggregateFunctions.map(_.aggBufferAttributes.length).sum)(NoOp) - val updateExpressions = - finalOffsetExpressions ++ completeAggregateFunctions.flatMap(_.updateExpressions) + val updateExpressions = finalOffsetExpressions ++ completeAggregateFunctions.flatMap { + case ae: DeclarativeAggregate => ae.updateExpressions + case agg: AggregateFunction2 => Seq.fill(agg.aggBufferAttributes.length)(NoOp) + } val completeUpdateProjection = newMutableProjection(updateExpressions, aggregationBufferAttributes ++ inputAttributes)() (currentBuffer: UnsafeRow, row: InternalRow) => { val input = joinedRow(currentBuffer, row) - // For all aggregate functions with mode Complete, update the given currentBuffer. + // For all aggregate functions with mode Complete, update buffers. completeUpdateProjection.target(currentBuffer)(input) + var i = 0 + while (i < completeImperativeAggregateFunctions.length) { + completeImperativeAggregateFunctions(i).update(currentBuffer, row) + i += 1 + } // For all aggregate functions with mode Final, merge buffer values in row to // currentBuffer. finalMergeProjection.target(currentBuffer)(input) + i = 0 + while (i < nonCompleteImperativeAggregateFunctions.length) { + nonCompleteImperativeAggregateFunctions(i).merge(currentBuffer, row) + i += 1 + } } // Complete-only case (None, Some(Complete)) => - val completeAggregateFunctions: Array[DeclarativeAggregate] = + val completeAggregateFunctions: Array[AggregateFunction2] = allAggregateFunctions.takeRight(completeAggregateExpressions.length) + // All imperative aggregate functions with mode Complete. 
+ val completeImperativeAggregateFunctions: Array[ImperativeAggregate] = + completeAggregateFunctions.collect { case func: ImperativeAggregate => func } - val updateExpressions = - completeAggregateFunctions.flatMap(_.updateExpressions) - val completeUpdateProjection = + val updateExpressions = completeAggregateFunctions.flatMap { + case ae: DeclarativeAggregate => ae.updateExpressions + case agg: AggregateFunction2 => Seq.fill(agg.aggBufferAttributes.length)(NoOp) + } + val completeExpressionAggUpdateProjection = newMutableProjection(updateExpressions, aggregationBufferAttributes ++ inputAttributes)() (currentBuffer: UnsafeRow, row: InternalRow) => { - completeUpdateProjection.target(currentBuffer) - // For all aggregate functions with mode Complete, update the given currentBuffer. - completeUpdateProjection(joinedRow(currentBuffer, row)) + // For all aggregate functions with mode Complete, update buffers. + completeExpressionAggUpdateProjection.target(currentBuffer)(joinedRow(currentBuffer, row)) + var i = 0 + while (i < completeImperativeAggregateFunctions.length) { + completeImperativeAggregateFunctions(i).update(currentBuffer, row) + i += 1 + } } // Grouping only. @@ -288,17 +404,30 @@ class TungstenAggregationIterator( val joinedRow = new JoinedRow() val evalExpressions = allAggregateFunctions.map { case ae: DeclarativeAggregate => ae.evaluateExpression - // case agg: AggregateFunction2 => Literal.create(null, agg.dataType) + case agg: AggregateFunction2 => NoOp } - val expressionAggEvalProjection = UnsafeProjection.create(evalExpressions, bufferAttributes) + val expressionAggEvalProjection = newMutableProjection(evalExpressions, bufferAttributes)() // These are the attributes of the row produced by `expressionAggEvalProjection` val aggregateResultSchema = nonCompleteAggregateAttributes ++ completeAggregateAttributes + val aggregateResult = new SpecificMutableRow(aggregateResultSchema.map(_.dataType)) + expressionAggEvalProjection.target(aggregateResult) val resultProjection = UnsafeProjection.create(resultExpressions, groupingAttributes ++ aggregateResultSchema) + val allImperativeAggregateFunctions: Array[ImperativeAggregate] = + allAggregateFunctions.collect { case func: ImperativeAggregate => func} + (currentGroupingKey: UnsafeRow, currentBuffer: UnsafeRow) => { // Generate results for all expression-based aggregate functions. - val aggregateResult = expressionAggEvalProjection.apply(currentBuffer) + expressionAggEvalProjection(currentBuffer) + // Generate results for all imperative aggregate functions. + var i = 0 + while (i < allImperativeAggregateFunctions.length) { + aggregateResult.update( + allImperativeAggregateFunctionPositions(i), + allImperativeAggregateFunctions(i).eval(currentBuffer)) + i += 1 + } resultProjection(joinedRow(currentGroupingKey, aggregateResult)) } @@ -481,10 +610,27 @@ class TungstenAggregationIterator( // When needsProcess is false, the format of input rows is groupingKey + aggregation buffer. // We need to project the aggregation buffer part from an input row. val buffer = createNewAggregationBuffer() - // The originalInputAttributes are using cloneBufferAttributes. So, we need to use - // allAggregateFunctions.flatMap(_.cloneBufferAttributes). + // In principle, we could use `allAggregateFunctions.flatMap(_.inputAggBufferAttributes)` to + // extract the aggregation buffer. In practice, however, we extract it positionally by relying + // on it being present at the end of the row. 
The reason for this relates to how the different + // aggregates handle input binding. + // + // ImperativeAggregate uses field numbers and field number offsets to manipulate its buffers, + // so its correctness does not rely on attribute bindings. When we fall back to sort-based + // aggregation, these field number offsets (mutableAggBufferOffset and inputAggBufferOffset) + // need to be updated and any internal state in the aggregate functions themselves must be + // reset, so we call withNewMutableAggBufferOffset and withNewInputAggBufferOffset to reset + // this state and update the offsets. + // + // The updated ImperativeAggregate will have different attribute ids for its + // aggBufferAttributes and inputAggBufferAttributes. This isn't a problem for the actual + // ImperativeAggregate evaluation, but it means that + // `allAggregateFunctions.flatMap(_.inputAggBufferAttributes)` will no longer match the + // attributes in `originalInputAttributes`, which is why we can't use those attributes here. + // + // For more details, see the discussion on PR #9038. val bufferExtractor = newMutableProjection( - allAggregateFunctions.flatMap(_.inputAggBufferAttributes), + originalInputAttributes.drop(initialInputBufferOffset), originalInputAttributes)() bufferExtractor.target(buffer) @@ -511,8 +657,10 @@ class TungstenAggregationIterator( } aggregationMode = newAggregationMode + allAggregateFunctions = initializeAllAggregateFunctions(startingInputBufferOffset = 0) + // Basically the value of the KVIterator returned by externalSorter - // will just aggregation buffer. At here, we use cloneBufferAttributes. + // will just aggregation buffer. At here, we use inputAggBufferAttributes. val newInputAttributes: Seq[Attribute] = allAggregateFunctions.flatMap(_.inputAggBufferAttributes) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/udaf.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/udaf.scala index fd02be1225f27..d2f56e0fc14a4 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/udaf.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/udaf.scala @@ -321,9 +321,17 @@ private[sql] class InputAggregationBuffer private[sql] ( */ private[sql] case class ScalaUDAF( children: Seq[Expression], - udaf: UserDefinedAggregateFunction) + udaf: UserDefinedAggregateFunction, + mutableAggBufferOffset: Int = 0, + inputAggBufferOffset: Int = 0) extends ImperativeAggregate with Logging { + override def withNewMutableAggBufferOffset(newMutableAggBufferOffset: Int): ImperativeAggregate = + copy(mutableAggBufferOffset = newMutableAggBufferOffset) + + override def withNewInputAggBufferOffset(newInputAggBufferOffset: Int): ImperativeAggregate = + copy(inputAggBufferOffset = newInputAggBufferOffset) + require( children.length == udaf.inputSchema.length, s"$udaf only accepts ${udaf.inputSchema.length} arguments, " + @@ -341,6 +349,11 @@ private[sql] case class ScalaUDAF( override val aggBufferAttributes: Seq[AttributeReference] = aggBufferSchema.toAttributes + // Note: although this simply copies aggBufferAttributes, this common code can not be placed + // in the superclass because that will lead to initialization ordering issues. 
+ override val inputAggBufferAttributes: Seq[AttributeReference] = + aggBufferAttributes.map(_.newInstance()) + private[this] lazy val childrenSchema: StructType = { val inputFields = children.zipWithIndex.map { case (child, index) => @@ -382,51 +395,33 @@ private[sql] case class ScalaUDAF( } // This buffer is only used at executor side. - private[this] var inputAggregateBuffer: InputAggregationBuffer = null - - // This buffer is only used at executor side. - private[this] var mutableAggregateBuffer: MutableAggregationBufferImpl = null + private[this] lazy val inputAggregateBuffer: InputAggregationBuffer = { + new InputAggregationBuffer( + aggBufferSchema, + bufferValuesToCatalystConverters, + bufferValuesToScalaConverters, + inputAggBufferOffset, + null) + } // This buffer is only used at executor side. - private[this] var evalAggregateBuffer: InputAggregationBuffer = null - - /** - * Sets the inputBufferOffset to newInputBufferOffset and then create a new instance of - * `inputAggregateBuffer` based on this new inputBufferOffset. - */ - override def withNewInputAggBufferOffset(newInputBufferOffset: Int): Unit = { - super.withNewInputAggBufferOffset(newInputBufferOffset) - // inputBufferOffset has been updated. - inputAggregateBuffer = - new InputAggregationBuffer( - aggBufferSchema, - bufferValuesToCatalystConverters, - bufferValuesToScalaConverters, - inputAggBufferOffset, - null) + private[this] lazy val mutableAggregateBuffer: MutableAggregationBufferImpl = { + new MutableAggregationBufferImpl( + aggBufferSchema, + bufferValuesToCatalystConverters, + bufferValuesToScalaConverters, + mutableAggBufferOffset, + null) } - /** - * Sets the mutableBufferOffset to newMutableBufferOffset and then create a new instance of - * `mutableAggregateBuffer` and `evalAggregateBuffer` based on this new mutableBufferOffset. - */ - override def withNewMutableAggBufferOffset(newMutableBufferOffset: Int): Unit = { - super.withNewMutableAggBufferOffset(newMutableBufferOffset) - // mutableBufferOffset has been updated. - mutableAggregateBuffer = - new MutableAggregationBufferImpl( - aggBufferSchema, - bufferValuesToCatalystConverters, - bufferValuesToScalaConverters, - mutableAggBufferOffset, - null) - evalAggregateBuffer = - new InputAggregationBuffer( - aggBufferSchema, - bufferValuesToCatalystConverters, - bufferValuesToScalaConverters, - mutableAggBufferOffset, - null) + // This buffer is only used at executor side. 
+ private[this] lazy val evalAggregateBuffer: InputAggregationBuffer = { + new InputAggregationBuffer( + aggBufferSchema, + bufferValuesToCatalystConverters, + bufferValuesToScalaConverters, + mutableAggBufferOffset, + null) } override def initialize(buffer: MutableRow): Unit = { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/utils.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/utils.scala index cf6e7ed0d337f..eaafd83158a15 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/utils.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/utils.scala @@ -19,21 +19,12 @@ package org.apache.spark.sql.execution.aggregate import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.expressions.aggregate._ -import org.apache.spark.sql.execution.{UnsafeFixedWidthAggregationMap, SparkPlan} -import org.apache.spark.sql.types.StructType +import org.apache.spark.sql.execution.SparkPlan /** * Utility functions used by the query planner to convert our plan to new aggregation code path. */ object Utils { - def supportsTungstenAggregate( - groupingExpressions: Seq[Expression], - aggregateBufferAttributes: Seq[Attribute]): Boolean = { - val aggregationBufferSchema = StructType.fromAttributes(aggregateBufferAttributes) - - UnsafeFixedWidthAggregationMap.supportsAggregationBufferSchema(aggregationBufferSchema) && - UnsafeProjection.canSupport(groupingExpressions) - } def planAggregateWithoutPartial( groupingExpressions: Seq[NamedExpression], @@ -70,8 +61,7 @@ object Utils { // Check if we can use TungstenAggregate. val usesTungstenAggregate = child.sqlContext.conf.unsafeEnabled && - aggregateExpressions.forall(_.aggregateFunction.isInstanceOf[DeclarativeAggregate]) && - supportsTungstenAggregate( + TungstenAggregate.supportsAggregate( groupingExpressions, aggregateExpressions.flatMap(_.aggregateFunction.aggBufferAttributes)) @@ -94,6 +84,7 @@ object Utils { nonCompleteAggregateAttributes = partialAggregateAttributes, completeAggregateExpressions = Nil, completeAggregateAttributes = Nil, + initialInputBufferOffset = 0, resultExpressions = partialResultExpressions, child = child) } else { @@ -125,6 +116,7 @@ object Utils { nonCompleteAggregateAttributes = finalAggregateAttributes, completeAggregateExpressions = Nil, completeAggregateAttributes = Nil, + initialInputBufferOffset = groupingExpressions.length, resultExpressions = resultExpressions, child = partialAggregate) } else { @@ -154,143 +146,150 @@ object Utils { val aggregateExpressions = functionsWithDistinct ++ functionsWithoutDistinct val usesTungstenAggregate = child.sqlContext.conf.unsafeEnabled && - aggregateExpressions.forall( - _.aggregateFunction.isInstanceOf[DeclarativeAggregate]) && - supportsTungstenAggregate( + TungstenAggregate.supportsAggregate( groupingExpressions, aggregateExpressions.flatMap(_.aggregateFunction.aggBufferAttributes)) - // 1. Create an Aggregate Operator for partial aggregations. - val groupingAttributes = groupingExpressions.map(_.toAttribute) - - // It is safe to call head at here since functionsWithDistinct has at least one - // AggregateExpression2. - val distinctColumnExpressions = - functionsWithDistinct.head.aggregateFunction.children - val namedDistinctColumnExpressions = distinctColumnExpressions.map { - case ne: NamedExpression => ne -> ne - case other => - val withAlias = Alias(other, other.toString)() - other -> withAlias + // functionsWithDistinct is guaranteed to be non-empty. 
Even though it may contain more than one + // DISTINCT aggregate function, all of those functions will have the same column expression. + // For example, it would be valid for functionsWithDistinct to be + // [COUNT(DISTINCT foo), MAX(DISTINCT foo)], but [COUNT(DISTINCT bar), COUNT(DISTINCT foo)] is + // disallowed because those two distinct aggregates have different column expressions. + val distinctColumnExpression: Expression = { + val allDistinctColumnExpressions = functionsWithDistinct.head.aggregateFunction.children + assert(allDistinctColumnExpressions.length == 1) + allDistinctColumnExpressions.head + } + val namedDistinctColumnExpression: NamedExpression = distinctColumnExpression match { + case ne: NamedExpression => ne + case other => Alias(other, other.toString)() } - val distinctColumnExpressionMap = namedDistinctColumnExpressions.toMap - val distinctColumnAttributes = namedDistinctColumnExpressions.map(_._2.toAttribute) + val distinctColumnAttribute: Attribute = namedDistinctColumnExpression.toAttribute + val groupingAttributes = groupingExpressions.map(_.toAttribute) - val partialAggregateExpressions = functionsWithoutDistinct.map(_.copy(mode = Partial)) - val partialAggregateAttributes = - partialAggregateExpressions.flatMap(_.aggregateFunction.aggBufferAttributes) - val partialAggregateGroupingExpressions = - groupingExpressions ++ namedDistinctColumnExpressions.map(_._2) - val partialAggregateResult = + // 1. Create an Aggregate Operator for partial aggregations. + val partialAggregate: SparkPlan = { + val partialAggregateExpressions = functionsWithoutDistinct.map(_.copy(mode = Partial)) + val partialAggregateAttributes = + partialAggregateExpressions.flatMap(_.aggregateFunction.aggBufferAttributes) + // We will group by the original grouping expression, plus an additional expression for the + // DISTINCT column. For example, for AVG(DISTINCT value) GROUP BY key, the grouping + // expressions will be [key, value]. + val partialAggregateGroupingExpressions = groupingExpressions :+ namedDistinctColumnExpression + val partialAggregateResult = groupingAttributes ++ - distinctColumnAttributes ++ - partialAggregateExpressions.flatMap(_.aggregateFunction.inputAggBufferAttributes) - val partialAggregate = if (usesTungstenAggregate) { - TungstenAggregate( - requiredChildDistributionExpressions = None, - // The grouping expressions are original groupingExpressions and - // distinct columns. For example, for avg(distinct value) ... group by key - // the grouping expressions of this Aggregate Operator will be [key, value]. 
- groupingExpressions = partialAggregateGroupingExpressions, - nonCompleteAggregateExpressions = partialAggregateExpressions, - nonCompleteAggregateAttributes = partialAggregateAttributes, - completeAggregateExpressions = Nil, - completeAggregateAttributes = Nil, - resultExpressions = partialAggregateResult, - child = child) - } else { - SortBasedAggregate( - requiredChildDistributionExpressions = None, - groupingExpressions = partialAggregateGroupingExpressions, - nonCompleteAggregateExpressions = partialAggregateExpressions, - nonCompleteAggregateAttributes = partialAggregateAttributes, - completeAggregateExpressions = Nil, - completeAggregateAttributes = Nil, - initialInputBufferOffset = 0, - resultExpressions = partialAggregateResult, - child = child) + Seq(distinctColumnAttribute) ++ + partialAggregateExpressions.flatMap(_.aggregateFunction.inputAggBufferAttributes) + if (usesTungstenAggregate) { + TungstenAggregate( + requiredChildDistributionExpressions = None, + groupingExpressions = partialAggregateGroupingExpressions, + nonCompleteAggregateExpressions = partialAggregateExpressions, + nonCompleteAggregateAttributes = partialAggregateAttributes, + completeAggregateExpressions = Nil, + completeAggregateAttributes = Nil, + initialInputBufferOffset = 0, + resultExpressions = partialAggregateResult, + child = child) + } else { + SortBasedAggregate( + requiredChildDistributionExpressions = None, + groupingExpressions = partialAggregateGroupingExpressions, + nonCompleteAggregateExpressions = partialAggregateExpressions, + nonCompleteAggregateAttributes = partialAggregateAttributes, + completeAggregateExpressions = Nil, + completeAggregateAttributes = Nil, + initialInputBufferOffset = 0, + resultExpressions = partialAggregateResult, + child = child) + } } // 2. Create an Aggregate Operator for partial merge aggregations. 
- val partialMergeAggregateExpressions = functionsWithoutDistinct.map(_.copy(mode = PartialMerge)) - val partialMergeAggregateAttributes = - partialMergeAggregateExpressions.flatMap(_.aggregateFunction.aggBufferAttributes) - val partialMergeAggregateResult = + val partialMergeAggregate: SparkPlan = { + val partialMergeAggregateExpressions = + functionsWithoutDistinct.map(_.copy(mode = PartialMerge)) + val partialMergeAggregateAttributes = + partialMergeAggregateExpressions.flatMap(_.aggregateFunction.aggBufferAttributes) + val partialMergeAggregateResult = groupingAttributes ++ - distinctColumnAttributes ++ - partialMergeAggregateExpressions.flatMap(_.aggregateFunction.inputAggBufferAttributes) - val partialMergeAggregate = if (usesTungstenAggregate) { - TungstenAggregate( - requiredChildDistributionExpressions = Some(groupingAttributes), - groupingExpressions = groupingAttributes ++ distinctColumnAttributes, - nonCompleteAggregateExpressions = partialMergeAggregateExpressions, - nonCompleteAggregateAttributes = partialMergeAggregateAttributes, - completeAggregateExpressions = Nil, - completeAggregateAttributes = Nil, - resultExpressions = partialMergeAggregateResult, - child = partialAggregate) - } else { - SortBasedAggregate( - requiredChildDistributionExpressions = Some(groupingAttributes), - groupingExpressions = groupingAttributes ++ distinctColumnAttributes, - nonCompleteAggregateExpressions = partialMergeAggregateExpressions, - nonCompleteAggregateAttributes = partialMergeAggregateAttributes, - completeAggregateExpressions = Nil, - completeAggregateAttributes = Nil, - initialInputBufferOffset = (groupingAttributes ++ distinctColumnAttributes).length, - resultExpressions = partialMergeAggregateResult, - child = partialAggregate) + Seq(distinctColumnAttribute) ++ + partialMergeAggregateExpressions.flatMap(_.aggregateFunction.inputAggBufferAttributes) + if (usesTungstenAggregate) { + TungstenAggregate( + requiredChildDistributionExpressions = Some(groupingAttributes), + groupingExpressions = groupingAttributes :+ distinctColumnAttribute, + nonCompleteAggregateExpressions = partialMergeAggregateExpressions, + nonCompleteAggregateAttributes = partialMergeAggregateAttributes, + completeAggregateExpressions = Nil, + completeAggregateAttributes = Nil, + initialInputBufferOffset = (groupingAttributes :+ distinctColumnAttribute).length, + resultExpressions = partialMergeAggregateResult, + child = partialAggregate) + } else { + SortBasedAggregate( + requiredChildDistributionExpressions = Some(groupingAttributes), + groupingExpressions = groupingAttributes :+ distinctColumnAttribute, + nonCompleteAggregateExpressions = partialMergeAggregateExpressions, + nonCompleteAggregateAttributes = partialMergeAggregateAttributes, + completeAggregateExpressions = Nil, + completeAggregateAttributes = Nil, + initialInputBufferOffset = (groupingAttributes :+ distinctColumnAttribute).length, + resultExpressions = partialMergeAggregateResult, + child = partialAggregate) + } } - // 3. Create an Aggregate Operator for partial merge aggregations. - val finalAggregateExpressions = functionsWithoutDistinct.map(_.copy(mode = Final)) - // The attributes of the final aggregation buffer, which is presented as input to the result - // projection: - val finalAggregateAttributes = finalAggregateExpressions.map { - expr => aggregateFunctionToAttribute(expr.aggregateFunction, expr.isDistinct) - } + // 3. Create an Aggregate Operator for the final aggregation. 
+ val finalAndCompleteAggregate: SparkPlan = { + val finalAggregateExpressions = functionsWithoutDistinct.map(_.copy(mode = Final)) + // The attributes of the final aggregation buffer, which is presented as input to the result + // projection: + val finalAggregateAttributes = finalAggregateExpressions.map { + expr => aggregateFunctionToAttribute(expr.aggregateFunction, expr.isDistinct) + } - val (completeAggregateExpressions, completeAggregateAttributes) = functionsWithDistinct.map { - // Children of an AggregateFunction with DISTINCT keyword has already - // been evaluated. At here, we need to replace original children - // to AttributeReferences. - case agg @ AggregateExpression2(aggregateFunction, mode, true) => - val rewrittenAggregateFunction = aggregateFunction.transformDown { - case expr if distinctColumnExpressionMap.contains(expr) => - distinctColumnExpressionMap(expr).toAttribute - }.asInstanceOf[AggregateFunction2] - // We rewrite the aggregate function to a non-distinct aggregation because - // its input will have distinct arguments. - // We just keep the isDistinct setting to true, so when users look at the query plan, - // they still can see distinct aggregations. - val rewrittenAggregateExpression = - AggregateExpression2(rewrittenAggregateFunction, Complete, true) + val (completeAggregateExpressions, completeAggregateAttributes) = functionsWithDistinct.map { + // Children of an AggregateFunction with DISTINCT keyword has already + // been evaluated. At here, we need to replace original children + // to AttributeReferences. + case agg @ AggregateExpression2(aggregateFunction, mode, true) => + val rewrittenAggregateFunction = aggregateFunction.transformDown { + case expr if expr == distinctColumnExpression => distinctColumnAttribute + }.asInstanceOf[AggregateFunction2] + // We rewrite the aggregate function to a non-distinct aggregation because + // its input will have distinct arguments. + // We just keep the isDistinct setting to true, so when users look at the query plan, + // they still can see distinct aggregations. 
+ val rewrittenAggregateExpression = + AggregateExpression2(rewrittenAggregateFunction, Complete, isDistinct = true) - val aggregateFunctionAttribute = aggregateFunctionToAttribute(agg.aggregateFunction, true) - (rewrittenAggregateExpression, aggregateFunctionAttribute) - }.unzip - - val finalAndCompleteAggregate = if (usesTungstenAggregate) { - TungstenAggregate( - requiredChildDistributionExpressions = Some(groupingAttributes), - groupingExpressions = groupingAttributes, - nonCompleteAggregateExpressions = finalAggregateExpressions, - nonCompleteAggregateAttributes = finalAggregateAttributes, - completeAggregateExpressions = completeAggregateExpressions, - completeAggregateAttributes = completeAggregateAttributes, - resultExpressions = resultExpressions, - child = partialMergeAggregate) - } else { - SortBasedAggregate( - requiredChildDistributionExpressions = Some(groupingAttributes), - groupingExpressions = groupingAttributes, - nonCompleteAggregateExpressions = finalAggregateExpressions, - nonCompleteAggregateAttributes = finalAggregateAttributes, - completeAggregateExpressions = completeAggregateExpressions, - completeAggregateAttributes = completeAggregateAttributes, - initialInputBufferOffset = (groupingAttributes ++ distinctColumnAttributes).length, - resultExpressions = resultExpressions, - child = partialMergeAggregate) + val aggregateFunctionAttribute = aggregateFunctionToAttribute(agg.aggregateFunction, true) + (rewrittenAggregateExpression, aggregateFunctionAttribute) + }.unzip + if (usesTungstenAggregate) { + TungstenAggregate( + requiredChildDistributionExpressions = Some(groupingAttributes), + groupingExpressions = groupingAttributes, + nonCompleteAggregateExpressions = finalAggregateExpressions, + nonCompleteAggregateAttributes = finalAggregateAttributes, + completeAggregateExpressions = completeAggregateExpressions, + completeAggregateAttributes = completeAggregateAttributes, + initialInputBufferOffset = (groupingAttributes :+ distinctColumnAttribute).length, + resultExpressions = resultExpressions, + child = partialMergeAggregate) + } else { + SortBasedAggregate( + requiredChildDistributionExpressions = Some(groupingAttributes), + groupingExpressions = groupingAttributes, + nonCompleteAggregateExpressions = finalAggregateExpressions, + nonCompleteAggregateAttributes = finalAggregateAttributes, + completeAggregateExpressions = completeAggregateExpressions, + completeAggregateAttributes = completeAggregateAttributes, + initialInputBufferOffset = (groupingAttributes :+ distinctColumnAttribute).length, + resultExpressions = resultExpressions, + child = partialMergeAggregate) + } } finalAndCompleteAggregate :: Nil diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/aggregate/TungstenAggregationIteratorSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/aggregate/TungstenAggregationIteratorSuite.scala index ed974b3a53d41..0cc4988ff681c 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/aggregate/TungstenAggregationIteratorSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/aggregate/TungstenAggregationIteratorSuite.scala @@ -39,7 +39,7 @@ class TungstenAggregationIteratorSuite extends SparkFunSuite with SharedSQLConte } val dummyAccum = SQLMetrics.createLongMetric(sparkContext, "dummy") iter = new TungstenAggregationIterator(Seq.empty, Seq.empty, Seq.empty, Seq.empty, Seq.empty, - Seq.empty, newMutableProjection, Seq.empty, None, dummyAccum, dummyAccum) + 0, Seq.empty, newMutableProjection, Seq.empty, None, 
dummyAccum, dummyAccum) val numPages = iter.getHashMap.getNumDataPages assert(numPages === 1) } finally { diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/hiveUDFs.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/hiveUDFs.scala index 18bbdb9908142..a2ebf6552fd06 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/hiveUDFs.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/hiveUDFs.scala @@ -553,10 +553,16 @@ private[hive] case class HiveGenericUDTF( private[hive] case class HiveUDAFFunction( funcWrapper: HiveFunctionWrapper, children: Seq[Expression], - isUDAFBridgeRequired: Boolean = false) + isUDAFBridgeRequired: Boolean = false, + mutableAggBufferOffset: Int = 0, + inputAggBufferOffset: Int = 0) extends ImperativeAggregate with HiveInspectors { - def this() = this(null, null) + override def withNewMutableAggBufferOffset(newMutableAggBufferOffset: Int): ImperativeAggregate = + copy(mutableAggBufferOffset = newMutableAggBufferOffset) + + override def withNewInputAggBufferOffset(newInputAggBufferOffset: Int): ImperativeAggregate = + copy(inputAggBufferOffset = newInputAggBufferOffset) @transient private lazy val resolver = @@ -614,7 +620,11 @@ private[hive] case class HiveUDAFFunction( buffer = function.getNewAggregationBuffer } - override def aggBufferAttributes: Seq[AttributeReference] = Nil + override val aggBufferAttributes: Seq[AttributeReference] = Nil + + // Note: although this simply copies aggBufferAttributes, this common code can not be placed + // in the superclass because that will lead to initialization ordering issues. + override val inputAggBufferAttributes: Seq[AttributeReference] = Nil // We rely on Hive to check the input data types, so use `AnyDataType` here to bypass our // catalyst type checking framework. From 9808052b5adfed7dafd6c1b3971b998e45b2799a Mon Sep 17 00:00:00 2001 From: Cheng Hao Date: Wed, 14 Oct 2015 20:56:08 -0700 Subject: [PATCH 060/139] [SPARK-11076] [SQL] Add decimal support for floor and ceil Actually all of the `UnaryMathExpression` doens't support the Decimal, will create follow ups for supporing it. This is the first PR which will be good to review the approach I am taking. Author: Cheng Hao Closes #9086 from chenghao-intel/ceiling. 
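A minimal sketch of the intended semantics (illustrative only, not part of the patch): with this change, `CEIL`/`FLOOR` on a `DecimalType(p, s)` input return `DecimalType.bounded(p - s + 1, 0)` instead of `LongType`, backed by the new `Decimal.ceil` / `Decimal.floor` helpers added below.

    import org.apache.spark.sql.types.Decimal

    // Illustrative sketch using the Decimal helpers introduced by this patch.
    val d = Decimal(BigDecimal("3.14"), 5, 2)
    println(d.ceil)    // 4  (a Decimal with scale 0)
    println(d.floor)   // 3
    println(Decimal(BigDecimal("-3.14"), 5, 2).ceil)   // -3 (rounds toward positive infinity)
    println(Decimal(BigDecimal("-3.14"), 5, 2).floor)  // -4 (rounds toward negative infinity)
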
--- .../expressions/mathExpressions.scala | 48 +++++++++++++++---- .../org/apache/spark/sql/types/Decimal.scala | 32 +++++++++++-- .../expressions/LiteralGenerator.scala | 14 +++++- .../expressions/MathFunctionsSuite.scala | 10 ++++ 4 files changed, 91 insertions(+), 13 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/mathExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/mathExpressions.scala index a8164e9e29ec6..28f616fbb9ca5 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/mathExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/mathExpressions.scala @@ -55,7 +55,7 @@ abstract class LeafMathExpression(c: Double, name: String) abstract class UnaryMathExpression(val f: Double => Double, name: String) extends UnaryExpression with Serializable with ImplicitCastInputTypes { - override def inputTypes: Seq[DataType] = Seq(DoubleType) + override def inputTypes: Seq[AbstractDataType] = Seq(DoubleType) override def dataType: DataType = DoubleType override def nullable: Boolean = true override def toString: String = s"$name($child)" @@ -153,13 +153,28 @@ case class Atan(child: Expression) extends UnaryMathExpression(math.atan, "ATAN" case class Cbrt(child: Expression) extends UnaryMathExpression(math.cbrt, "CBRT") case class Ceil(child: Expression) extends UnaryMathExpression(math.ceil, "CEIL") { - override def dataType: DataType = LongType - protected override def nullSafeEval(input: Any): Any = { - f(input.asInstanceOf[Double]).toLong + override def dataType: DataType = child.dataType match { + case dt @ DecimalType.Fixed(_, 0) => dt + case DecimalType.Fixed(precision, scale) => + DecimalType.bounded(precision - scale + 1, 0) + case _ => LongType + } + + override def inputTypes: Seq[AbstractDataType] = + Seq(TypeCollection(DoubleType, DecimalType)) + + protected override def nullSafeEval(input: Any): Any = child.dataType match { + case DoubleType => f(input.asInstanceOf[Double]).toLong + case DecimalType.Fixed(precision, scale) => input.asInstanceOf[Decimal].ceil } override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = { - defineCodeGen(ctx, ev, c => s"(long)(java.lang.Math.${funcName}($c))") + child.dataType match { + case DecimalType.Fixed(_, 0) => defineCodeGen(ctx, ev, c => s"$c") + case DecimalType.Fixed(precision, scale) => + defineCodeGen(ctx, ev, c => s"$c.ceil()") + case _ => defineCodeGen(ctx, ev, c => s"(long)(java.lang.Math.${funcName}($c))") + } } } @@ -205,13 +220,28 @@ case class Exp(child: Expression) extends UnaryMathExpression(math.exp, "EXP") case class Expm1(child: Expression) extends UnaryMathExpression(math.expm1, "EXPM1") case class Floor(child: Expression) extends UnaryMathExpression(math.floor, "FLOOR") { - override def dataType: DataType = LongType - protected override def nullSafeEval(input: Any): Any = { - f(input.asInstanceOf[Double]).toLong + override def dataType: DataType = child.dataType match { + case dt @ DecimalType.Fixed(_, 0) => dt + case DecimalType.Fixed(precision, scale) => + DecimalType.bounded(precision - scale + 1, 0) + case _ => LongType + } + + override def inputTypes: Seq[AbstractDataType] = + Seq(TypeCollection(DoubleType, DecimalType)) + + protected override def nullSafeEval(input: Any): Any = child.dataType match { + case DoubleType => f(input.asInstanceOf[Double]).toLong + case DecimalType.Fixed(precision, scale) => input.asInstanceOf[Decimal].floor } override 
def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = { - defineCodeGen(ctx, ev, c => s"(long)(java.lang.Math.${funcName}($c))") + child.dataType match { + case DecimalType.Fixed(_, 0) => defineCodeGen(ctx, ev, c => s"$c") + case DecimalType.Fixed(precision, scale) => + defineCodeGen(ctx, ev, c => s"$c.floor()") + case _ => defineCodeGen(ctx, ev, c => s"(long)(java.lang.Math.${funcName}($c))") + } } } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/Decimal.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/Decimal.scala index c11dab35cdf6f..c7a1a2e7469ee 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/types/Decimal.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/types/Decimal.scala @@ -107,7 +107,7 @@ final class Decimal extends Ordered[Decimal] with Serializable { * Set this Decimal to the given BigDecimal value, with a given precision and scale. */ def set(decimal: BigDecimal, precision: Int, scale: Int): Decimal = { - this.decimalVal = decimal.setScale(scale, ROUNDING_MODE) + this.decimalVal = decimal.setScale(scale, ROUND_HALF_UP) require( decimalVal.precision <= precision, s"Decimal precision ${decimalVal.precision} exceeds max precision $precision") @@ -198,6 +198,16 @@ final class Decimal extends Ordered[Decimal] with Serializable { * @return true if successful, false if overflow would occur */ def changePrecision(precision: Int, scale: Int): Boolean = { + changePrecision(precision, scale, ROUND_HALF_UP) + } + + /** + * Update precision and scale while keeping our value the same, and return true if successful. + * + * @return true if successful, false if overflow would occur + */ + private[sql] def changePrecision(precision: Int, scale: Int, + roundMode: BigDecimal.RoundingMode.Value): Boolean = { // fast path for UnsafeProjection if (precision == this.precision && scale == this.scale) { return true @@ -231,7 +241,7 @@ final class Decimal extends Ordered[Decimal] with Serializable { if (decimalVal.ne(null)) { // We get here if either we started with a BigDecimal, or we switched to one because we would // have overflowed our Long; in either case we must rescale decimalVal to the new scale. 
- val newVal = decimalVal.setScale(scale, ROUNDING_MODE) + val newVal = decimalVal.setScale(scale, roundMode) if (newVal.precision > precision) { return false } @@ -309,10 +319,26 @@ final class Decimal extends Ordered[Decimal] with Serializable { } def abs: Decimal = if (this.compare(Decimal.ZERO) < 0) this.unary_- else this + + def floor: Decimal = if (scale == 0) this else { + val value = this.clone() + value.changePrecision( + DecimalType.bounded(precision - scale + 1, 0).precision, 0, ROUND_FLOOR) + value + } + + def ceil: Decimal = if (scale == 0) this else { + val value = this.clone() + value.changePrecision( + DecimalType.bounded(precision - scale + 1, 0).precision, 0, ROUND_CEILING) + value + } } object Decimal { - private val ROUNDING_MODE = BigDecimal.RoundingMode.HALF_UP + val ROUND_HALF_UP = BigDecimal.RoundingMode.HALF_UP + val ROUND_CEILING = BigDecimal.RoundingMode.CEILING + val ROUND_FLOOR = BigDecimal.RoundingMode.FLOOR /** Maximum number of decimal digits a Long can represent */ val MAX_LONG_DIGITS = 18 diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/LiteralGenerator.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/LiteralGenerator.scala index ee6d25157fc08..d9c91415e249d 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/LiteralGenerator.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/LiteralGenerator.scala @@ -78,7 +78,18 @@ object LiteralGenerator { Double.NaN, Double.PositiveInfinity, Double.NegativeInfinity) } yield Literal.create(f, DoubleType) - // TODO: decimal type + // TODO cache the generated data + def decimalLiteralGen(precision: Int, scale: Int): Gen[Literal] = { + assert(scale >= 0) + assert(precision >= scale) + Arbitrary.arbBigInt.arbitrary.map { s => + val a = (s % BigInt(10).pow(precision - scale)).toString() + val b = (s % BigInt(10).pow(scale)).abs.toString() + Literal.create( + Decimal(BigDecimal(s"$a.$b"), precision, scale), + DecimalType(precision, scale)) + } + } lazy val stringLiteralGen: Gen[Literal] = for { s <- Arbitrary.arbString.arbitrary } yield Literal.create(s, StringType) @@ -122,6 +133,7 @@ object LiteralGenerator { case StringType => stringLiteralGen case BinaryType => binaryLiteralGen case CalendarIntervalType => calendarIntervalLiterGen + case DecimalType.Fixed(precision, scale) => decimalLiteralGen(precision, scale) case dt => throw new IllegalArgumentException(s"not supported type $dt") } } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/MathFunctionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/MathFunctionsSuite.scala index 1b2a9163a3d09..88ed9fdd6465f 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/MathFunctionsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/MathFunctionsSuite.scala @@ -246,11 +246,21 @@ class MathFunctionsSuite extends SparkFunSuite with ExpressionEvalHelper { test("ceil") { testUnary(Ceil, (d: Double) => math.ceil(d).toLong) checkConsistencyBetweenInterpretedAndCodegen(Ceil, DoubleType) + + testUnary(Ceil, (d: Decimal) => d.ceil, (-20 to 20).map(x => Decimal(x * 0.1))) + checkConsistencyBetweenInterpretedAndCodegen(Ceil, DecimalType(25, 3)) + checkConsistencyBetweenInterpretedAndCodegen(Ceil, DecimalType(25, 0)) + checkConsistencyBetweenInterpretedAndCodegen(Ceil, DecimalType(5, 0)) } test("floor") { testUnary(Floor, (d: Double) => 
math.floor(d).toLong) checkConsistencyBetweenInterpretedAndCodegen(Floor, DoubleType) + + testUnary(Floor, (d: Decimal) => d.floor, (-20 to 20).map(x => Decimal(x * 0.1))) + checkConsistencyBetweenInterpretedAndCodegen(Floor, DecimalType(25, 3)) + checkConsistencyBetweenInterpretedAndCodegen(Floor, DecimalType(25, 0)) + checkConsistencyBetweenInterpretedAndCodegen(Floor, DecimalType(5, 0)) } test("factorial") { From 0f62c2282bb30cb4fb6eea9d28b198d557a79b22 Mon Sep 17 00:00:00 2001 From: Adam Lewandowski Date: Thu, 15 Oct 2015 09:45:54 -0700 Subject: [PATCH 061/139] [SPARK-11093] [CORE] ChildFirstURLClassLoader#getResources should return all found resources, not just those in the child classloader Author: Adam Lewandowski Closes #9106 from alewando/childFirstFix. --- .../spark/util/MutableURLClassLoader.scala | 13 +++--- .../util/MutableURLClassLoaderSuite.scala | 40 ++++++++++++++++++- 2 files changed, 44 insertions(+), 9 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/util/MutableURLClassLoader.scala b/core/src/main/scala/org/apache/spark/util/MutableURLClassLoader.scala index a1c33212cdb2b..945217203be72 100644 --- a/core/src/main/scala/org/apache/spark/util/MutableURLClassLoader.scala +++ b/core/src/main/scala/org/apache/spark/util/MutableURLClassLoader.scala @@ -21,6 +21,8 @@ import java.net.{URLClassLoader, URL} import java.util.Enumeration import java.util.concurrent.ConcurrentHashMap +import scala.collection.JavaConverters._ + /** * URL class loader that exposes the `addURL` and `getURLs` methods in URLClassLoader. */ @@ -82,14 +84,9 @@ private[spark] class ChildFirstURLClassLoader(urls: Array[URL], parent: ClassLoa } override def getResources(name: String): Enumeration[URL] = { - val urls = super.findResources(name) - val res = - if (urls != null && urls.hasMoreElements()) { - urls - } else { - parentClassLoader.getResources(name) - } - res + val childUrls = super.findResources(name).asScala + val parentUrls = parentClassLoader.getResources(name).asScala + (childUrls ++ parentUrls).asJavaEnumeration } override def addURL(url: URL) { diff --git a/core/src/test/scala/org/apache/spark/util/MutableURLClassLoaderSuite.scala b/core/src/test/scala/org/apache/spark/util/MutableURLClassLoaderSuite.scala index d3d464e84ffd7..8b53d4f14a6a4 100644 --- a/core/src/test/scala/org/apache/spark/util/MutableURLClassLoaderSuite.scala +++ b/core/src/test/scala/org/apache/spark/util/MutableURLClassLoaderSuite.scala @@ -19,9 +19,14 @@ package org.apache.spark.util import java.net.URLClassLoader +import scala.collection.JavaConverters._ + +import org.scalatest.Matchers +import org.scalatest.Matchers._ + import org.apache.spark.{SparkContext, SparkException, SparkFunSuite, TestUtils} -class MutableURLClassLoaderSuite extends SparkFunSuite { +class MutableURLClassLoaderSuite extends SparkFunSuite with Matchers { val urls2 = List(TestUtils.createJarWithClasses( classNames = Seq("FakeClass1", "FakeClass2", "FakeClass3"), @@ -32,6 +37,12 @@ class MutableURLClassLoaderSuite extends SparkFunSuite { toStringValue = "1", classpathUrls = urls2)).toArray + val fileUrlsChild = List(TestUtils.createJarWithFiles(Map( + "resource1" -> "resource1Contents-child", + "resource2" -> "resource2Contents"))).toArray + val fileUrlsParent = List(TestUtils.createJarWithFiles(Map( + "resource1" -> "resource1Contents-parent"))).toArray + test("child first") { val parentLoader = new URLClassLoader(urls2, null) val classLoader = new ChildFirstURLClassLoader(urls, parentLoader) @@ -68,6 +79,33 @@ class 
MutableURLClassLoaderSuite extends SparkFunSuite { } } + test("default JDK classloader get resources") { + val parentLoader = new URLClassLoader(fileUrlsParent, null) + val classLoader = new URLClassLoader(fileUrlsChild, parentLoader) + assert(classLoader.getResources("resource1").asScala.size === 2) + assert(classLoader.getResources("resource2").asScala.size === 1) + } + + test("parent first get resources") { + val parentLoader = new URLClassLoader(fileUrlsParent, null) + val classLoader = new MutableURLClassLoader(fileUrlsChild, parentLoader) + assert(classLoader.getResources("resource1").asScala.size === 2) + assert(classLoader.getResources("resource2").asScala.size === 1) + } + + test("child first get resources") { + val parentLoader = new URLClassLoader(fileUrlsParent, null) + val classLoader = new ChildFirstURLClassLoader(fileUrlsChild, parentLoader) + + val res1 = classLoader.getResources("resource1").asScala.toList + assert(res1.size === 2) + assert(classLoader.getResources("resource2").asScala.size === 1) + + res1.map(scala.io.Source.fromURL(_).mkString) should contain inOrderOnly + ("resource1Contents-child", "resource1Contents-parent") + } + + test("driver sets context class loader in local mode") { // Test the case where the driver program sets a context classloader and then runs a job // in local mode. This is what happens when ./spark-submit is called with "local" as the From aec4400beffc569c13cceea2d0c481dfa3f34175 Mon Sep 17 00:00:00 2001 From: Jeff Zhang Date: Thu, 15 Oct 2015 09:49:19 -0700 Subject: [PATCH 062/139] =?UTF-8?q?[SPARK-11099]=20[SPARK=20SHELL]=20[SPAR?= =?UTF-8?q?K=20SUBMIT]=20Default=20conf=20property=20file=20i=E2=80=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Please help review it. Thanks Author: Jeff Zhang Closes #9114 from zjffdu/SPARK-11099. 
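The precedence this introduces, sketched below (an illustrative Scala snippet, not the launcher's actual Java code): values set explicitly on the launcher or via --conf win, while entries read from spark-defaults.conf under SPARK_CONF_DIR only fill in keys that were not set explicitly.

    // Hypothetical sketch of the merge rule implemented in
    // AbstractCommandBuilder.getEffectiveConfig() below.
    def effectiveConfig(
        explicitConf: Map[String, String],
        defaults: Map[String, String]): Map[String, String] = {
      // The right-hand side of ++ wins, so explicit settings override file defaults.
      defaults ++ explicitConf
    }

    val fromFile = Map("spark.driver.memory" -> "1g", "spark.driver.extraClassPath" -> "/driver")
    val explicit = Map("spark.driver.memory" -> "4g")
    assert(effectiveConfig(explicit, fromFile)("spark.driver.memory") == "4g")
    assert(effectiveConfig(explicit, fromFile)("spark.driver.extraClassPath") == "/driver")
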
--- .../launcher/AbstractCommandBuilder.java | 14 ++++------ .../SparkSubmitCommandBuilderSuite.java | 28 +++++++++++++------ .../src/test/resources/spark-defaults.conf | 21 ++++++++++++++ 3 files changed, 45 insertions(+), 18 deletions(-) create mode 100644 launcher/src/test/resources/spark-defaults.conf diff --git a/launcher/src/main/java/org/apache/spark/launcher/AbstractCommandBuilder.java b/launcher/src/main/java/org/apache/spark/launcher/AbstractCommandBuilder.java index cf3729b7febc3..3ee6bd92e47fc 100644 --- a/launcher/src/main/java/org/apache/spark/launcher/AbstractCommandBuilder.java +++ b/launcher/src/main/java/org/apache/spark/launcher/AbstractCommandBuilder.java @@ -272,15 +272,11 @@ void setPropertiesFile(String path) { Map getEffectiveConfig() throws IOException { if (effectiveConfig == null) { - if (propertiesFile == null) { - effectiveConfig = conf; - } else { - effectiveConfig = new HashMap<>(conf); - Properties p = loadPropertiesFile(); - for (String key : p.stringPropertyNames()) { - if (!effectiveConfig.containsKey(key)) { - effectiveConfig.put(key, p.getProperty(key)); - } + effectiveConfig = new HashMap<>(conf); + Properties p = loadPropertiesFile(); + for (String key : p.stringPropertyNames()) { + if (!effectiveConfig.containsKey(key)) { + effectiveConfig.put(key, p.getProperty(key)); } } } diff --git a/launcher/src/test/java/org/apache/spark/launcher/SparkSubmitCommandBuilderSuite.java b/launcher/src/test/java/org/apache/spark/launcher/SparkSubmitCommandBuilderSuite.java index d5397b0685046..6aad47adbcc82 100644 --- a/launcher/src/test/java/org/apache/spark/launcher/SparkSubmitCommandBuilderSuite.java +++ b/launcher/src/test/java/org/apache/spark/launcher/SparkSubmitCommandBuilderSuite.java @@ -48,12 +48,14 @@ public static void cleanUp() throws Exception { @Test public void testDriverCmdBuilder() throws Exception { - testCmdBuilder(true); + testCmdBuilder(true, true); + testCmdBuilder(true, false); } @Test public void testClusterCmdBuilder() throws Exception { - testCmdBuilder(false); + testCmdBuilder(false, true); + testCmdBuilder(false, false); } @Test @@ -149,7 +151,7 @@ public void testPySparkFallback() throws Exception { assertEquals("arg1", cmd.get(cmd.size() - 1)); } - private void testCmdBuilder(boolean isDriver) throws Exception { + private void testCmdBuilder(boolean isDriver, boolean useDefaultPropertyFile) throws Exception { String deployMode = isDriver ? 
"client" : "cluster"; SparkSubmitCommandBuilder launcher = @@ -161,14 +163,20 @@ private void testCmdBuilder(boolean isDriver) throws Exception { launcher.appResource = "/foo"; launcher.appName = "MyApp"; launcher.mainClass = "my.Class"; - launcher.setPropertiesFile(dummyPropsFile.getAbsolutePath()); launcher.appArgs.add("foo"); launcher.appArgs.add("bar"); - launcher.conf.put(SparkLauncher.DRIVER_MEMORY, "1g"); - launcher.conf.put(SparkLauncher.DRIVER_EXTRA_CLASSPATH, "/driver"); - launcher.conf.put(SparkLauncher.DRIVER_EXTRA_JAVA_OPTIONS, "-Ddriver -XX:MaxPermSize=256m"); - launcher.conf.put(SparkLauncher.DRIVER_EXTRA_LIBRARY_PATH, "/native"); launcher.conf.put("spark.foo", "foo"); + // either set the property through "--conf" or through default property file + if (!useDefaultPropertyFile) { + launcher.setPropertiesFile(dummyPropsFile.getAbsolutePath()); + launcher.conf.put(SparkLauncher.DRIVER_MEMORY, "1g"); + launcher.conf.put(SparkLauncher.DRIVER_EXTRA_CLASSPATH, "/driver"); + launcher.conf.put(SparkLauncher.DRIVER_EXTRA_JAVA_OPTIONS, "-Ddriver -XX:MaxPermSize=256m"); + launcher.conf.put(SparkLauncher.DRIVER_EXTRA_LIBRARY_PATH, "/native"); + } else { + launcher.childEnv.put("SPARK_CONF_DIR", System.getProperty("spark.test.home") + + "/launcher/src/test/resources"); + } Map env = new HashMap(); List cmd = launcher.buildCommand(env); @@ -216,7 +224,9 @@ private void testCmdBuilder(boolean isDriver) throws Exception { } // Checks below are the same for both driver and non-driver mode. - assertEquals(dummyPropsFile.getAbsolutePath(), findArgValue(cmd, parser.PROPERTIES_FILE)); + if (!useDefaultPropertyFile) { + assertEquals(dummyPropsFile.getAbsolutePath(), findArgValue(cmd, parser.PROPERTIES_FILE)); + } assertEquals("yarn", findArgValue(cmd, parser.MASTER)); assertEquals(deployMode, findArgValue(cmd, parser.DEPLOY_MODE)); assertEquals("my.Class", findArgValue(cmd, parser.CLASS)); diff --git a/launcher/src/test/resources/spark-defaults.conf b/launcher/src/test/resources/spark-defaults.conf new file mode 100644 index 0000000000000..239fc57883e98 --- /dev/null +++ b/launcher/src/test/resources/spark-defaults.conf @@ -0,0 +1,21 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +spark.driver.memory=1g +spark.driver.extraClassPath=/driver +spark.driver.extraJavaOptions=-Ddriver -XX:MaxPermSize=256m +spark.driver.extraLibraryPath=/native \ No newline at end of file From 523adc24a683930304f408d477607edfe9de7b76 Mon Sep 17 00:00:00 2001 From: shellberg Date: Thu, 15 Oct 2015 18:07:10 +0100 Subject: [PATCH 063/139] [SPARK-11066] Update DAGScheduler's "misbehaved ResultHandler" Restrict tasks (of job) to only 1 to ensure that the causing Exception asserted for job failure is the deliberately thrown DAGSchedulerSuiteDummyException intended, not an UnsupportedOperationException from any second/subsequent tasks that can propagate from a race condition during code execution. Author: shellberg Closes #9076 from shellberg/shellberg-DAGSchedulerSuite-misbehavedResultHandlerTest-patch-1. --- .../apache/spark/scheduler/DAGSchedulerSuite.scala | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala index 697c195e4ad1f..5b01ddb298c39 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala @@ -1375,18 +1375,27 @@ class DAGSchedulerSuite assert(sc.parallelize(1 to 10, 2).count() === 10) } + /** + * The job will be failed on first task throwing a DAGSchedulerSuiteDummyException. + * Any subsequent task WILL throw a legitimate java.lang.UnsupportedOperationException. + * If multiple tasks, there exists a race condition between the SparkDriverExecutionExceptions + * and their differing causes as to which will represent result for job... + */ test("misbehaved resultHandler should not crash DAGScheduler and SparkContext") { val e = intercept[SparkDriverExecutionException] { + // Number of parallelized partitions implies number of tasks of job val rdd = sc.parallelize(1 to 10, 2) sc.runJob[Int, Int]( rdd, (context: TaskContext, iter: Iterator[Int]) => iter.size, - Seq(0, 1), + // For a robust test assertion, limit number of job tasks to 1; that is, + // if multiple RDD partitions, use id of any one partition, say, first partition id=0 + Seq(0), (part: Int, result: Int) => throw new DAGSchedulerSuiteDummyException) } assert(e.getCause.isInstanceOf[DAGSchedulerSuiteDummyException]) - // Make sure we can still run commands + // Make sure we can still run commands on our SparkContext assert(sc.parallelize(1 to 10, 2).count() === 10) } From d45a0d3ca23df86cf0a95508ccc3b4b98f1b611c Mon Sep 17 00:00:00 2001 From: Carson Wang Date: Thu, 15 Oct 2015 10:36:54 -0700 Subject: [PATCH 064/139] [SPARK-11047] Internal accumulators miss the internal flag when replaying events in the history server Internal accumulators don't write the internal flag to event log. So on the history server Web UI, all accumulators are not internal. This causes incorrect peak execution memory and unwanted accumulator table displayed on the stage page. To fix it, I add the "internal" property of AccumulableInfo when writing the event log. Author: Carson Wang Closes #9061 from carsonwang/accumulableBug. 
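The backward-compatibility behavior this relies on, sketched below (an illustrative json4s snippet, assuming the json4s dependency that JsonProtocol already uses): event logs written before this change carry no "Internal" field, so replay must default it to false.

    import org.json4s._
    import org.json4s.jackson.JsonMethods._

    implicit val formats = DefaultFormats

    // Old logs omit "Internal"; new logs include it.
    val oldJson = parse("""{"ID": 1, "Name": "acc", "Update": "1", "Value": "10"}""")
    val newJson = parse("""{"ID": 1, "Name": "acc", "Update": "1", "Value": "10", "Internal": true}""")

    def internalFlag(json: JValue): Boolean =
      (json \ "Internal").extractOpt[Boolean].getOrElse(false)

    assert(internalFlag(oldJson) == false)
    assert(internalFlag(newJson) == true)
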
--- .../spark/scheduler/AccumulableInfo.scala | 9 ++ .../org/apache/spark/util/JsonProtocol.scala | 6 +- .../apache/spark/util/JsonProtocolSuite.scala | 96 +++++++++++++------ 3 files changed, 79 insertions(+), 32 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/scheduler/AccumulableInfo.scala b/core/src/main/scala/org/apache/spark/scheduler/AccumulableInfo.scala index b6bff64ee368e..146cfb9ba8037 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/AccumulableInfo.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/AccumulableInfo.scala @@ -46,6 +46,15 @@ class AccumulableInfo private[spark] ( } object AccumulableInfo { + def apply( + id: Long, + name: String, + update: Option[String], + value: String, + internal: Boolean): AccumulableInfo = { + new AccumulableInfo(id, name, update, value, internal) + } + def apply(id: Long, name: String, update: Option[String], value: String): AccumulableInfo = { new AccumulableInfo(id, name, update, value, internal = false) } diff --git a/core/src/main/scala/org/apache/spark/util/JsonProtocol.scala b/core/src/main/scala/org/apache/spark/util/JsonProtocol.scala index 40729fa5a4ffe..a06dc6f709d33 100644 --- a/core/src/main/scala/org/apache/spark/util/JsonProtocol.scala +++ b/core/src/main/scala/org/apache/spark/util/JsonProtocol.scala @@ -282,7 +282,8 @@ private[spark] object JsonProtocol { ("ID" -> accumulableInfo.id) ~ ("Name" -> accumulableInfo.name) ~ ("Update" -> accumulableInfo.update.map(new JString(_)).getOrElse(JNothing)) ~ - ("Value" -> accumulableInfo.value) + ("Value" -> accumulableInfo.value) ~ + ("Internal" -> accumulableInfo.internal) } def taskMetricsToJson(taskMetrics: TaskMetrics): JValue = { @@ -696,7 +697,8 @@ private[spark] object JsonProtocol { val name = (json \ "Name").extract[String] val update = Utils.jsonOption(json \ "Update").map(_.extract[String]) val value = (json \ "Value").extract[String] - AccumulableInfo(id, name, update, value) + val internal = (json \ "Internal").extractOpt[Boolean].getOrElse(false) + AccumulableInfo(id, name, update, value, internal) } def taskMetricsFromJson(json: JValue): TaskMetrics = { diff --git a/core/src/test/scala/org/apache/spark/util/JsonProtocolSuite.scala b/core/src/test/scala/org/apache/spark/util/JsonProtocolSuite.scala index a24bf2931cca0..f9572921f43cb 100644 --- a/core/src/test/scala/org/apache/spark/util/JsonProtocolSuite.scala +++ b/core/src/test/scala/org/apache/spark/util/JsonProtocolSuite.scala @@ -364,6 +364,15 @@ class JsonProtocolSuite extends SparkFunSuite { assertEquals(expectedDenied, JsonProtocol.taskEndReasonFromJson(oldDenied)) } + test("AccumulableInfo backward compatibility") { + // "Internal" property of AccumulableInfo were added after 1.5.1. 
+ val accumulableInfo = makeAccumulableInfo(1) + val oldJson = JsonProtocol.accumulableInfoToJson(accumulableInfo) + .removeField({ _._1 == "Internal" }) + val oldInfo = JsonProtocol.accumulableInfoFromJson(oldJson) + assert(false === oldInfo.internal) + } + /** -------------------------- * | Helper test running methods | * --------------------------- */ @@ -723,15 +732,15 @@ class JsonProtocolSuite extends SparkFunSuite { val taskInfo = new TaskInfo(a, b, c, d, "executor", "your kind sir", TaskLocality.NODE_LOCAL, speculative) val (acc1, acc2, acc3) = - (makeAccumulableInfo(1), makeAccumulableInfo(2), makeAccumulableInfo(3)) + (makeAccumulableInfo(1), makeAccumulableInfo(2), makeAccumulableInfo(3, internal = true)) taskInfo.accumulables += acc1 taskInfo.accumulables += acc2 taskInfo.accumulables += acc3 taskInfo } - private def makeAccumulableInfo(id: Int): AccumulableInfo = - AccumulableInfo(id, " Accumulable " + id, Some("delta" + id), "val" + id) + private def makeAccumulableInfo(id: Int, internal: Boolean = false): AccumulableInfo = + AccumulableInfo(id, " Accumulable " + id, Some("delta" + id), "val" + id, internal) /** * Creates a TaskMetrics object describing a task that read data from Hadoop (if hasHadoopInput is @@ -812,13 +821,15 @@ class JsonProtocolSuite extends SparkFunSuite { | "ID": 2, | "Name": "Accumulable2", | "Update": "delta2", - | "Value": "val2" + | "Value": "val2", + | "Internal": false | }, | { | "ID": 1, | "Name": "Accumulable1", | "Update": "delta1", - | "Value": "val1" + | "Value": "val1", + | "Internal": false | } | ] | }, @@ -866,13 +877,15 @@ class JsonProtocolSuite extends SparkFunSuite { | "ID": 2, | "Name": "Accumulable2", | "Update": "delta2", - | "Value": "val2" + | "Value": "val2", + | "Internal": false | }, | { | "ID": 1, | "Name": "Accumulable1", | "Update": "delta1", - | "Value": "val1" + | "Value": "val1", + | "Internal": false | } | ] | } @@ -902,19 +915,22 @@ class JsonProtocolSuite extends SparkFunSuite { | "ID": 1, | "Name": "Accumulable1", | "Update": "delta1", - | "Value": "val1" + | "Value": "val1", + | "Internal": false | }, | { | "ID": 2, | "Name": "Accumulable2", | "Update": "delta2", - | "Value": "val2" + | "Value": "val2", + | "Internal": false | }, | { | "ID": 3, | "Name": "Accumulable3", | "Update": "delta3", - | "Value": "val3" + | "Value": "val3", + | "Internal": true | } | ] | } @@ -942,19 +958,22 @@ class JsonProtocolSuite extends SparkFunSuite { | "ID": 1, | "Name": "Accumulable1", | "Update": "delta1", - | "Value": "val1" + | "Value": "val1", + | "Internal": false | }, | { | "ID": 2, | "Name": "Accumulable2", | "Update": "delta2", - | "Value": "val2" + | "Value": "val2", + | "Internal": false | }, | { | "ID": 3, | "Name": "Accumulable3", | "Update": "delta3", - | "Value": "val3" + | "Value": "val3", + | "Internal": true | } | ] | } @@ -988,19 +1007,22 @@ class JsonProtocolSuite extends SparkFunSuite { | "ID": 1, | "Name": "Accumulable1", | "Update": "delta1", - | "Value": "val1" + | "Value": "val1", + | "Internal": false | }, | { | "ID": 2, | "Name": "Accumulable2", | "Update": "delta2", - | "Value": "val2" + | "Value": "val2", + | "Internal": false | }, | { | "ID": 3, | "Name": "Accumulable3", | "Update": "delta3", - | "Value": "val3" + | "Value": "val3", + | "Internal": true | } | ] | }, @@ -1074,19 +1096,22 @@ class JsonProtocolSuite extends SparkFunSuite { | "ID": 1, | "Name": "Accumulable1", | "Update": "delta1", - | "Value": "val1" + | "Value": "val1", + | "Internal": false | }, | { | "ID": 2, | "Name": "Accumulable2", | 
"Update": "delta2", - | "Value": "val2" + | "Value": "val2", + | "Internal": false | }, | { | "ID": 3, | "Name": "Accumulable3", | "Update": "delta3", - | "Value": "val3" + | "Value": "val3", + | "Internal": true | } | ] | }, @@ -1157,19 +1182,22 @@ class JsonProtocolSuite extends SparkFunSuite { | "ID": 1, | "Name": "Accumulable1", | "Update": "delta1", - | "Value": "val1" + | "Value": "val1", + | "Internal": false | }, | { | "ID": 2, | "Name": "Accumulable2", | "Update": "delta2", - | "Value": "val2" + | "Value": "val2", + | "Internal": false | }, | { | "ID": 3, | "Name": "Accumulable3", | "Update": "delta3", - | "Value": "val3" + | "Value": "val3", + | "Internal": true | } | ] | }, @@ -1251,13 +1279,15 @@ class JsonProtocolSuite extends SparkFunSuite { | "ID": 2, | "Name": " Accumulable 2", | "Update": "delta2", - | "Value": "val2" + | "Value": "val2", + | "Internal": false | }, | { | "ID": 1, | "Name": " Accumulable 1", | "Update": "delta1", - | "Value": "val1" + | "Value": "val1", + | "Internal": false | } | ] | }, @@ -1309,13 +1339,15 @@ class JsonProtocolSuite extends SparkFunSuite { | "ID": 2, | "Name": " Accumulable 2", | "Update": "delta2", - | "Value": "val2" + | "Value": "val2", + | "Internal": false | }, | { | "ID": 1, | "Name": " Accumulable 1", | "Update": "delta1", - | "Value": "val1" + | "Value": "val1", + | "Internal": false | } | ] | }, @@ -1384,13 +1416,15 @@ class JsonProtocolSuite extends SparkFunSuite { | "ID": 2, | "Name": " Accumulable 2", | "Update": "delta2", - | "Value": "val2" + | "Value": "val2", + | "Internal": false | }, | { | "ID": 1, | "Name": " Accumulable 1", | "Update": "delta1", - | "Value": "val1" + | "Value": "val1", + | "Internal": false | } | ] | }, @@ -1476,13 +1510,15 @@ class JsonProtocolSuite extends SparkFunSuite { | "ID": 2, | "Name": " Accumulable 2", | "Update": "delta2", - | "Value": "val2" + | "Value": "val2", + | "Internal": false | }, | { | "ID": 1, | "Name": " Accumulable 1", | "Update": "delta1", - | "Value": "val1" + | "Value": "val1", + | "Internal": false | } | ] | } From b591de7c07ba8e71092f71e34001520bec995a8a Mon Sep 17 00:00:00 2001 From: Nick Pritchard Date: Thu, 15 Oct 2015 12:45:37 -0700 Subject: [PATCH 065/139] [SPARK-11039][Documentation][Web UI] Document additional ui configurations Add documentation for configuration: - spark.sql.ui.retainedExecutions - spark.streaming.ui.retainedBatches Author: Nick Pritchard Closes #9052 from pnpritchard/SPARK-11039. --- docs/configuration.md | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/docs/configuration.md b/docs/configuration.md index 771d93be04b06..46d92ceb762d6 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -554,6 +554,20 @@ Apart from these, the following properties are also available, and may be useful How many finished drivers the Spark UI and status APIs remember before garbage collecting. + + + + + + + + + +
+<tr>
+  <td><code>spark.sql.ui.retainedExecutions</code></td>
+  <td>1000</td>
+  <td>
+    How many finished executions the Spark UI and status APIs remember before garbage collecting.
+  </td>
+</tr>
+<tr>
+  <td><code>spark.streaming.ui.retainedBatches</code></td>
+  <td>1000</td>
+  <td>
+    How many finished batches the Spark UI and status APIs remember before garbage collecting.
+  </td>
+</tr>
#### Compression and Serialization From a5719804c5ed99ce36bd0dd230ab8b3b7a3b92e3 Mon Sep 17 00:00:00 2001 From: Marcelo Vanzin Date: Thu, 15 Oct 2015 14:46:40 -0700 Subject: [PATCH 066/139] [SPARK-11071] [LAUNCHER] Fix flakiness in LauncherServerSuite::timeout. The test could fail depending on scheduling of the various threads involved; the change removes some sources of races, while making the test a little more resilient by trying a few times before giving up. Author: Marcelo Vanzin Closes #9079 from vanzin/SPARK-11071. --- .../apache/spark/launcher/LauncherServer.java | 9 ++++- .../spark/launcher/LauncherServerSuite.java | 35 ++++++++++++++----- 2 files changed, 34 insertions(+), 10 deletions(-) diff --git a/launcher/src/main/java/org/apache/spark/launcher/LauncherServer.java b/launcher/src/main/java/org/apache/spark/launcher/LauncherServer.java index c5fd40816d62f..d099ee9aa9dae 100644 --- a/launcher/src/main/java/org/apache/spark/launcher/LauncherServer.java +++ b/launcher/src/main/java/org/apache/spark/launcher/LauncherServer.java @@ -242,7 +242,14 @@ public void run() { synchronized (clients) { clients.add(clientConnection); } - timeoutTimer.schedule(timeout, getConnectionTimeout()); + long timeoutMs = getConnectionTimeout(); + // 0 is used for testing to avoid issues with clock resolution / thread scheduling, + // and force an immediate timeout. + if (timeoutMs > 0) { + timeoutTimer.schedule(timeout, getConnectionTimeout()); + } else { + timeout.run(); + } } } } catch (IOException ioe) { diff --git a/launcher/src/test/java/org/apache/spark/launcher/LauncherServerSuite.java b/launcher/src/test/java/org/apache/spark/launcher/LauncherServerSuite.java index 27cd1061a15b3..dc8fbb58d880b 100644 --- a/launcher/src/test/java/org/apache/spark/launcher/LauncherServerSuite.java +++ b/launcher/src/test/java/org/apache/spark/launcher/LauncherServerSuite.java @@ -121,12 +121,12 @@ private void wakeUp() { @Test public void testTimeout() throws Exception { - final long TEST_TIMEOUT = 10L; - ChildProcAppHandle handle = null; TestClient client = null; try { - SparkLauncher.setConfig(SparkLauncher.CHILD_CONNECTION_TIMEOUT, String.valueOf(TEST_TIMEOUT)); + // LauncherServer will immediately close the server-side socket when the timeout is set + // to 0. + SparkLauncher.setConfig(SparkLauncher.CHILD_CONNECTION_TIMEOUT, "0"); handle = LauncherServer.newAppHandle(); @@ -134,12 +134,29 @@ public void testTimeout() throws Exception { LauncherServer.getServerInstance().getPort()); client = new TestClient(s); - Thread.sleep(TEST_TIMEOUT * 10); - try { - client.send(new Hello(handle.getSecret(), "1.4.0")); - fail("Expected exception caused by connection timeout."); - } catch (IllegalStateException e) { - // Expected. + // Try a few times since the client-side socket may not reflect the server-side close + // immediately. + boolean helloSent = false; + int maxTries = 10; + for (int i = 0; i < maxTries; i++) { + try { + if (!helloSent) { + client.send(new Hello(handle.getSecret(), "1.4.0")); + helloSent = true; + } else { + client.send(new SetAppId("appId")); + } + fail("Expected exception caused by connection timeout."); + } catch (IllegalStateException | IOException e) { + // Expected. 
+ break; + } catch (AssertionError e) { + if (i < maxTries - 1) { + Thread.sleep(100); + } else { + throw new AssertionError("Test failed after " + maxTries + " attempts.", e); + } + } } } finally { SparkLauncher.launcherConfig.remove(SparkLauncher.CHILD_CONNECTION_TIMEOUT); From 723aa75a9d566c698aa49597f4f655396fef77bd Mon Sep 17 00:00:00 2001 From: Britta Weber Date: Thu, 15 Oct 2015 14:47:11 -0700 Subject: [PATCH 067/139] fix typo bellow -> below Author: Britta Weber Closes #9136 from brwe/typo-bellow. --- docs/mllib-collaborative-filtering.md | 2 +- docs/mllib-linear-methods.md | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/mllib-collaborative-filtering.md b/docs/mllib-collaborative-filtering.md index b3fd51dca5c90..1ad52123c74aa 100644 --- a/docs/mllib-collaborative-filtering.md +++ b/docs/mllib-collaborative-filtering.md @@ -119,7 +119,7 @@ All of MLlib's methods use Java-friendly types, so you can import and call them way you do in Scala. The only caveat is that the methods take Scala RDD objects, while the Spark Java API uses a separate `JavaRDD` class. You can convert a Java RDD to a Scala one by calling `.rdd()` on your `JavaRDD` object. A self-contained application example -that is equivalent to the provided example in Scala is given bellow: +that is equivalent to the provided example in Scala is given below: Refer to the [`ALS` Java docs](api/java/org/apache/spark/mllib/recommendation/ALS.html) for details on the API. diff --git a/docs/mllib-linear-methods.md b/docs/mllib-linear-methods.md index a3e1620c778ff..0c76e6e999465 100644 --- a/docs/mllib-linear-methods.md +++ b/docs/mllib-linear-methods.md @@ -230,7 +230,7 @@ All of MLlib's methods use Java-friendly types, so you can import and call them way you do in Scala. The only caveat is that the methods take Scala RDD objects, while the Spark Java API uses a separate `JavaRDD` class. You can convert a Java RDD to a Scala one by calling `.rdd()` on your `JavaRDD` object. A self-contained application example -that is equivalent to the provided example in Scala is given bellow: +that is equivalent to the provided example in Scala is given below: Refer to the [`SVMWithSGD` Java docs](api/java/org/apache/spark/mllib/classification/SVMWithSGD.html) and [`SVMModel` Java docs](api/java/org/apache/spark/mllib/classification/SVMModel.html) for details on the API. @@ -612,7 +612,7 @@ All of MLlib's methods use Java-friendly types, so you can import and call them way you do in Scala. The only caveat is that the methods take Scala RDD objects, while the Spark Java API uses a separate `JavaRDD` class. You can convert a Java RDD to a Scala one by calling `.rdd()` on your `JavaRDD` object. The corresponding Java example to -the Scala snippet provided, is presented bellow: +the Scala snippet provided, is presented below: Refer to the [`LinearRegressionWithSGD` Java docs](api/java/org/apache/spark/mllib/regression/LinearRegressionWithSGD.html) and [`LinearRegressionModel` Java docs](api/java/org/apache/spark/mllib/regression/LinearRegressionModel.html) for details on the API. 
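[Editorial aside, not part of the patch series] The MLlib documentation touched by the typo fix above describes calling MLlib's Scala-RDD-based methods from Java by converting a `JavaRDD` with `.rdd()`. Below is a minimal, hypothetical sketch of that pattern; the class name, master URL, and toy ratings data are illustrative assumptions and are not taken from any patch. It relies only on public MLlib APIs (`ALS.train`, `MatrixFactorizationModel.predict`).

// Hypothetical, self-contained sketch (not from the patches above): converting a
// JavaRDD<Rating> to the Scala RDD<Rating> that MLlib's ALS.train() expects via .rdd().
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.mllib.recommendation.ALS;
import org.apache.spark.mllib.recommendation.MatrixFactorizationModel;
import org.apache.spark.mllib.recommendation.Rating;

public class JavaAlsExample {
  public static void main(String[] args) {
    SparkConf conf = new SparkConf().setAppName("JavaAlsExample").setMaster("local[2]");
    JavaSparkContext jsc = new JavaSparkContext(conf);

    // Tiny in-memory dataset of (userId, productId, rating) triples, purely for illustration.
    JavaRDD<Rating> ratings = jsc.parallelize(java.util.Arrays.asList(
        new Rating(1, 1, 5.0),
        new Rating(1, 2, 1.0),
        new Rating(2, 1, 4.0),
        new Rating(2, 2, 2.0)));

    // ALS.train() takes a Scala RDD<Rating>, so convert the JavaRDD with .rdd().
    MatrixFactorizationModel model =
        ALS.train(ratings.rdd(), /* rank */ 10, /* iterations */ 5, /* lambda */ 0.01);

    // Query the fitted model for a single (user, product) pair.
    double prediction = model.predict(1, 2);
    System.out.println("Predicted rating for user 1, product 2: " + prediction);
    jsc.stop();
  }
}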
From 2d000124b72d0ff9e3ecefa03923405642516c4c Mon Sep 17 00:00:00 2001 From: KaiXinXiaoLei Date: Thu, 15 Oct 2015 14:48:01 -0700 Subject: [PATCH 068/139] [SPARK-10515] When killing executor, the pending replacement executors should not be lost If the heartbeat receiver kills executors (and new ones are not registered to replace them), the idle timeout for the old executors will be lost (and then change a total number of executors requested by Driver), So new ones will be not to asked to replace them. For example, executorsPendingToRemove=Set(1), and executor 2 is idle timeout before a new executor is asked to replace executor 1. Then driver kill executor 2, and sending RequestExecutors to AM. But executorsPendingToRemove=Set(1,2), So AM doesn't allocate a executor to replace 1. see: https://github.com/apache/spark/pull/8668 Author: KaiXinXiaoLei Author: huleilei Closes #8945 from KaiXinXiaoLei/pendingexecutor. --- .../CoarseGrainedSchedulerBackend.scala | 2 ++ .../StandaloneDynamicAllocationSuite.scala | 35 +++++++++++++++++++ 2 files changed, 37 insertions(+) diff --git a/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala b/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala index 18771f79b44bb..55a564b5c8eac 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala @@ -438,6 +438,8 @@ class CoarseGrainedSchedulerBackend(scheduler: TaskSchedulerImpl, val rpcEnv: Rp if (!replace) { doRequestTotalExecutors( numExistingExecutors + numPendingExecutors - executorsPendingToRemove.size) + } else { + numPendingExecutors += knownExecutors.size } doKillExecutors(executorsToKill) diff --git a/core/src/test/scala/org/apache/spark/deploy/StandaloneDynamicAllocationSuite.scala b/core/src/test/scala/org/apache/spark/deploy/StandaloneDynamicAllocationSuite.scala index 2e2fa22eb4772..d145e78834b1b 100644 --- a/core/src/test/scala/org/apache/spark/deploy/StandaloneDynamicAllocationSuite.scala +++ b/core/src/test/scala/org/apache/spark/deploy/StandaloneDynamicAllocationSuite.scala @@ -369,6 +369,41 @@ class StandaloneDynamicAllocationSuite assert(apps.head.getExecutorLimit === 1) } + test("the pending replacement executors should not be lost (SPARK-10515)") { + sc = new SparkContext(appConf) + val appId = sc.applicationId + eventually(timeout(10.seconds), interval(10.millis)) { + val apps = getApplications() + assert(apps.size === 1) + assert(apps.head.id === appId) + assert(apps.head.executors.size === 2) + assert(apps.head.getExecutorLimit === Int.MaxValue) + } + // sync executors between the Master and the driver, needed because + // the driver refuses to kill executors it does not know about + syncExecutors(sc) + val executors = getExecutorIds(sc) + assert(executors.size === 2) + // kill executor 1, and replace it + assert(sc.killAndReplaceExecutor(executors.head)) + eventually(timeout(10.seconds), interval(10.millis)) { + val apps = getApplications() + assert(apps.head.executors.size === 2) + } + + var apps = getApplications() + // kill executor 1 + assert(sc.killExecutor(executors.head)) + apps = getApplications() + assert(apps.head.executors.size === 2) + assert(apps.head.getExecutorLimit === 2) + // kill executor 2 + assert(sc.killExecutor(executors(1))) + apps = getApplications() + assert(apps.head.executors.size === 1) + assert(apps.head.getExecutorLimit === 1) + } + // 
=============================== // | Utility methods for testing | // =============================== From 3b364ff0a4f38c2b8023429a55623de32be5f329 Mon Sep 17 00:00:00 2001 From: Andrew Or Date: Thu, 15 Oct 2015 14:50:01 -0700 Subject: [PATCH 069/139] [SPARK-11078] Ensure spilling tests actually spill #9084 uncovered that many tests that test spilling don't actually spill. This is a follow-up patch to fix that to ensure our unit tests actually catch potential bugs in spilling. The size of this patch is inflated by the refactoring of `ExternalSorterSuite`, which had a lot of duplicate code and logic. Author: Andrew Or Closes #9124 from andrewor14/spilling-tests. --- .../scala/org/apache/spark/TestUtils.scala | 51 + .../spark/shuffle/ShuffleMemoryManager.scala | 6 +- .../collection/ExternalAppendOnlyMap.scala | 6 + .../spark/util/collection/Spillable.scala | 37 +- .../org/apache/spark/DistributedSuite.scala | 39 +- .../ExternalAppendOnlyMapSuite.scala | 103 ++- .../util/collection/ExternalSorterSuite.scala | 871 ++++++++---------- .../execution/TestShuffleMemoryManager.scala | 2 + 8 files changed, 534 insertions(+), 581 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/TestUtils.scala b/core/src/main/scala/org/apache/spark/TestUtils.scala index 888763a3e8ebf..acfe751f6c746 100644 --- a/core/src/main/scala/org/apache/spark/TestUtils.scala +++ b/core/src/main/scala/org/apache/spark/TestUtils.scala @@ -24,10 +24,14 @@ import java.util.Arrays import java.util.jar.{JarEntry, JarOutputStream} import scala.collection.JavaConverters._ +import scala.collection.mutable +import scala.collection.mutable.ArrayBuffer import com.google.common.io.{ByteStreams, Files} import javax.tools.{JavaFileObject, SimpleJavaFileObject, ToolProvider} +import org.apache.spark.executor.TaskMetrics +import org.apache.spark.scheduler._ import org.apache.spark.util.Utils /** @@ -154,4 +158,51 @@ private[spark] object TestUtils { " @Override public String toString() { return \"" + toStringValue + "\"; }}") createCompiledClass(className, destDir, sourceFile, classpathUrls) } + + /** + * Run some code involving jobs submitted to the given context and assert that the jobs spilled. + */ + def assertSpilled[T](sc: SparkContext, identifier: String)(body: => T): Unit = { + val spillListener = new SpillListener + sc.addSparkListener(spillListener) + body + assert(spillListener.numSpilledStages > 0, s"expected $identifier to spill, but did not") + } + + /** + * Run some code involving jobs submitted to the given context and assert that the jobs + * did not spill. + */ + def assertNotSpilled[T](sc: SparkContext, identifier: String)(body: => T): Unit = { + val spillListener = new SpillListener + sc.addSparkListener(spillListener) + body + assert(spillListener.numSpilledStages == 0, s"expected $identifier to not spill, but did") + } + +} + + +/** + * A [[SparkListener]] that detects whether spills have occurred in Spark jobs. 
+ */ +private class SpillListener extends SparkListener { + private val stageIdToTaskMetrics = new mutable.HashMap[Int, ArrayBuffer[TaskMetrics]] + private val spilledStageIds = new mutable.HashSet[Int] + + def numSpilledStages: Int = spilledStageIds.size + + override def onTaskEnd(taskEnd: SparkListenerTaskEnd): Unit = { + stageIdToTaskMetrics.getOrElseUpdate( + taskEnd.stageId, new ArrayBuffer[TaskMetrics]) += taskEnd.taskMetrics + } + + override def onStageCompleted(stageComplete: SparkListenerStageCompleted): Unit = { + val stageId = stageComplete.stageInfo.stageId + val metrics = stageIdToTaskMetrics.remove(stageId).toSeq.flatten + val spilled = metrics.map(_.memoryBytesSpilled).sum > 0 + if (spilled) { + spilledStageIds += stageId + } + } } diff --git a/core/src/main/scala/org/apache/spark/shuffle/ShuffleMemoryManager.scala b/core/src/main/scala/org/apache/spark/shuffle/ShuffleMemoryManager.scala index aaf543ce9232a..9bd18da47f1a2 100644 --- a/core/src/main/scala/org/apache/spark/shuffle/ShuffleMemoryManager.scala +++ b/core/src/main/scala/org/apache/spark/shuffle/ShuffleMemoryManager.scala @@ -139,8 +139,10 @@ class ShuffleMemoryManager protected ( throw new SparkException( s"Internal error: release called on $numBytes bytes but task only has $curMem") } - taskMemory(taskAttemptId) -= numBytes - memoryManager.releaseExecutionMemory(numBytes) + if (taskMemory.contains(taskAttemptId)) { + taskMemory(taskAttemptId) -= numBytes + memoryManager.releaseExecutionMemory(numBytes) + } memoryManager.notifyAll() // Notify waiters in tryToAcquire that memory has been freed } diff --git a/core/src/main/scala/org/apache/spark/util/collection/ExternalAppendOnlyMap.scala b/core/src/main/scala/org/apache/spark/util/collection/ExternalAppendOnlyMap.scala index 6a96b5dc12684..cfa58f5ef408a 100644 --- a/core/src/main/scala/org/apache/spark/util/collection/ExternalAppendOnlyMap.scala +++ b/core/src/main/scala/org/apache/spark/util/collection/ExternalAppendOnlyMap.scala @@ -95,6 +95,12 @@ class ExternalAppendOnlyMap[K, V, C]( private val keyComparator = new HashComparator[K] private val ser = serializer.newInstance() + /** + * Number of files this map has spilled so far. + * Exposed for testing. + */ + private[collection] def numSpills: Int = spilledMaps.size + /** * Insert the given key and value into the map. 
*/ diff --git a/core/src/main/scala/org/apache/spark/util/collection/Spillable.scala b/core/src/main/scala/org/apache/spark/util/collection/Spillable.scala index 747ecf075a397..d2a68ca7a3b4c 100644 --- a/core/src/main/scala/org/apache/spark/util/collection/Spillable.scala +++ b/core/src/main/scala/org/apache/spark/util/collection/Spillable.scala @@ -43,10 +43,15 @@ private[spark] trait Spillable[C] extends Logging { private[this] val shuffleMemoryManager = SparkEnv.get.shuffleMemoryManager // Initial threshold for the size of a collection before we start tracking its memory usage - // Exposed for testing + // For testing only private[this] val initialMemoryThreshold: Long = SparkEnv.get.conf.getLong("spark.shuffle.spill.initialMemoryThreshold", 5 * 1024 * 1024) + // Force this collection to spill when there are this many elements in memory + // For testing only + private[this] val numElementsForceSpillThreshold: Long = + SparkEnv.get.conf.getLong("spark.shuffle.spill.numElementsForceSpillThreshold", Long.MaxValue) + // Threshold for this collection's size in bytes before we start tracking its memory usage // To avoid a large number of small spills, initialize this to a value orders of magnitude > 0 private[this] var myMemoryThreshold = initialMemoryThreshold @@ -69,27 +74,27 @@ private[spark] trait Spillable[C] extends Logging { * @return true if `collection` was spilled to disk; false otherwise */ protected def maybeSpill(collection: C, currentMemory: Long): Boolean = { + var shouldSpill = false if (elementsRead % 32 == 0 && currentMemory >= myMemoryThreshold) { // Claim up to double our current memory from the shuffle memory pool val amountToRequest = 2 * currentMemory - myMemoryThreshold val granted = shuffleMemoryManager.tryToAcquire(amountToRequest) myMemoryThreshold += granted - if (myMemoryThreshold <= currentMemory) { - // We were granted too little memory to grow further (either tryToAcquire returned 0, - // or we already had more memory than myMemoryThreshold); spill the current collection - _spillCount += 1 - logSpillage(currentMemory) - - spill(collection) - - _elementsRead = 0 - // Keep track of spills, and release memory - _memoryBytesSpilled += currentMemory - releaseMemoryForThisThread() - return true - } + // If we were granted too little memory to grow further (either tryToAcquire returned 0, + // or we already had more memory than myMemoryThreshold), spill the current collection + shouldSpill = currentMemory >= myMemoryThreshold + } + shouldSpill = shouldSpill || _elementsRead > numElementsForceSpillThreshold + // Actually spill + if (shouldSpill) { + _spillCount += 1 + logSpillage(currentMemory) + spill(collection) + _elementsRead = 0 + _memoryBytesSpilled += currentMemory + releaseMemoryForThisThread() } - false + shouldSpill } /** diff --git a/core/src/test/scala/org/apache/spark/DistributedSuite.scala b/core/src/test/scala/org/apache/spark/DistributedSuite.scala index 34a4bb968e732..1c3f2bc315ddc 100644 --- a/core/src/test/scala/org/apache/spark/DistributedSuite.scala +++ b/core/src/test/scala/org/apache/spark/DistributedSuite.scala @@ -203,22 +203,35 @@ class DistributedSuite extends SparkFunSuite with Matchers with LocalSparkContex } test("compute without caching when no partitions fit in memory") { - sc = new SparkContext(clusterUrl, "test") - // data will be 4 million * 4 bytes = 16 MB in size, but our memoryFraction set the cache - // to only 50 KB (0.0001 of 512 MB), so no partitions should fit in memory - val data = sc.parallelize(1 to 4000000, 
2).persist(StorageLevel.MEMORY_ONLY_SER) - assert(data.count() === 4000000) - assert(data.count() === 4000000) - assert(data.count() === 4000000) + val size = 10000 + val conf = new SparkConf() + .set("spark.storage.unrollMemoryThreshold", "1024") + .set("spark.testing.memory", (size / 2).toString) + sc = new SparkContext(clusterUrl, "test", conf) + val data = sc.parallelize(1 to size, 2).persist(StorageLevel.MEMORY_ONLY) + assert(data.count() === size) + assert(data.count() === size) + assert(data.count() === size) + // ensure only a subset of partitions were cached + val rddBlocks = sc.env.blockManager.master.getMatchingBlockIds(_.isRDD, askSlaves = true) + assert(rddBlocks.size === 0, s"expected no RDD blocks, found ${rddBlocks.size}") } test("compute when only some partitions fit in memory") { - sc = new SparkContext(clusterUrl, "test", new SparkConf) - // TODO: verify that only a subset of partitions fit in memory (SPARK-11078) - val data = sc.parallelize(1 to 4000000, 20).persist(StorageLevel.MEMORY_ONLY_SER) - assert(data.count() === 4000000) - assert(data.count() === 4000000) - assert(data.count() === 4000000) + val size = 10000 + val numPartitions = 10 + val conf = new SparkConf() + .set("spark.storage.unrollMemoryThreshold", "1024") + .set("spark.testing.memory", (size * numPartitions).toString) + sc = new SparkContext(clusterUrl, "test", conf) + val data = sc.parallelize(1 to size, numPartitions).persist(StorageLevel.MEMORY_ONLY) + assert(data.count() === size) + assert(data.count() === size) + assert(data.count() === size) + // ensure only a subset of partitions were cached + val rddBlocks = sc.env.blockManager.master.getMatchingBlockIds(_.isRDD, askSlaves = true) + assert(rddBlocks.size > 0, "no RDD blocks found") + assert(rddBlocks.size < numPartitions, s"too many RDD blocks found, expected <$numPartitions") } test("passing environment variables to cluster") { diff --git a/core/src/test/scala/org/apache/spark/util/collection/ExternalAppendOnlyMapSuite.scala b/core/src/test/scala/org/apache/spark/util/collection/ExternalAppendOnlyMapSuite.scala index 0a03c32c647ae..5cb506ea2164e 100644 --- a/core/src/test/scala/org/apache/spark/util/collection/ExternalAppendOnlyMapSuite.scala +++ b/core/src/test/scala/org/apache/spark/util/collection/ExternalAppendOnlyMapSuite.scala @@ -22,9 +22,10 @@ import scala.collection.mutable.ArrayBuffer import org.apache.spark._ import org.apache.spark.io.CompressionCodec -// TODO: some of these spilling tests probably aren't actually spilling (SPARK-11078) class ExternalAppendOnlyMapSuite extends SparkFunSuite with LocalSparkContext { + import TestUtils.{assertNotSpilled, assertSpilled} + private val allCompressionCodecs = CompressionCodec.ALL_COMPRESSION_CODECS private def createCombiner[T](i: T) = ArrayBuffer[T](i) private def mergeValue[T](buffer: ArrayBuffer[T], i: T): ArrayBuffer[T] = buffer += i @@ -244,54 +245,53 @@ class ExternalAppendOnlyMapSuite extends SparkFunSuite with LocalSparkContext { * If a compression codec is provided, use it. Otherwise, do not compress spills. 
*/ private def testSimpleSpilling(codec: Option[String] = None): Unit = { + val size = 1000 val conf = createSparkConf(loadDefaults = true, codec) // Load defaults for Spark home + conf.set("spark.shuffle.manager", "hash") // avoid using external sorter + conf.set("spark.shuffle.spill.numElementsForceSpillThreshold", (size / 4).toString) sc = new SparkContext("local-cluster[1,1,1024]", "test", conf) - // reduceByKey - should spill ~8 times - val rddA = sc.parallelize(0 until 100000).map(i => (i/2, i)) - val resultA = rddA.reduceByKey(math.max).collect() - assert(resultA.length === 50000) - resultA.foreach { case (k, v) => - assert(v === k * 2 + 1, s"Value for $k was wrong: expected ${k * 2 + 1}, got $v") + assertSpilled(sc, "reduceByKey") { + val result = sc.parallelize(0 until size) + .map { i => (i / 2, i) }.reduceByKey(math.max).collect() + assert(result.length === size / 2) + result.foreach { case (k, v) => + val expected = k * 2 + 1 + assert(v === expected, s"Value for $k was wrong: expected $expected, got $v") + } } - // groupByKey - should spill ~17 times - val rddB = sc.parallelize(0 until 100000).map(i => (i/4, i)) - val resultB = rddB.groupByKey().collect() - assert(resultB.length === 25000) - resultB.foreach { case (i, seq) => - val expected = Set(i * 4, i * 4 + 1, i * 4 + 2, i * 4 + 3) - assert(seq.toSet === expected, - s"Value for $i was wrong: expected $expected, got ${seq.toSet}") + assertSpilled(sc, "groupByKey") { + val result = sc.parallelize(0 until size).map { i => (i / 2, i) }.groupByKey().collect() + assert(result.length == size / 2) + result.foreach { case (i, seq) => + val actual = seq.toSet + val expected = Set(i * 2, i * 2 + 1) + assert(actual === expected, s"Value for $i was wrong: expected $expected, got $actual") + } } - // cogroup - should spill ~7 times - val rddC1 = sc.parallelize(0 until 10000).map(i => (i, i)) - val rddC2 = sc.parallelize(0 until 10000).map(i => (i%1000, i)) - val resultC = rddC1.cogroup(rddC2).collect() - assert(resultC.length === 10000) - resultC.foreach { case (i, (seq1, seq2)) => - i match { - case 0 => - assert(seq1.toSet === Set[Int](0)) - assert(seq2.toSet === Set[Int](0, 1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 9000)) - case 1 => - assert(seq1.toSet === Set[Int](1)) - assert(seq2.toSet === Set[Int](1, 1001, 2001, 3001, 4001, 5001, 6001, 7001, 8001, 9001)) - case 5000 => - assert(seq1.toSet === Set[Int](5000)) - assert(seq2.toSet === Set[Int]()) - case 9999 => - assert(seq1.toSet === Set[Int](9999)) - assert(seq2.toSet === Set[Int]()) - case _ => + assertSpilled(sc, "cogroup") { + val rdd1 = sc.parallelize(0 until size).map { i => (i / 2, i) } + val rdd2 = sc.parallelize(0 until size).map { i => (i / 2, i) } + val result = rdd1.cogroup(rdd2).collect() + assert(result.length === size / 2) + result.foreach { case (i, (seq1, seq2)) => + val actual1 = seq1.toSet + val actual2 = seq2.toSet + val expected = Set(i * 2, i * 2 + 1) + assert(actual1 === expected, s"Value 1 for $i was wrong: expected $expected, got $actual1") + assert(actual2 === expected, s"Value 2 for $i was wrong: expected $expected, got $actual2") } } + sc.stop() } test("spilling with hash collisions") { + val size = 1000 val conf = createSparkConf(loadDefaults = true) + conf.set("spark.shuffle.spill.numElementsForceSpillThreshold", (size / 2).toString) sc = new SparkContext("local-cluster[1,1,1024]", "test", conf) val map = createExternalMap[String] @@ -315,11 +315,12 @@ class ExternalAppendOnlyMapSuite extends SparkFunSuite with LocalSparkContext { 
assert(w1.hashCode === w2.hashCode) } - map.insertAll((1 to 100000).iterator.map(_.toString).map(i => (i, i))) + map.insertAll((1 to size).iterator.map(_.toString).map(i => (i, i))) collisionPairs.foreach { case (w1, w2) => map.insert(w1, w2) map.insert(w2, w1) } + assert(map.numSpills > 0, "map did not spill") // A map of collision pairs in both directions val collisionPairsMap = (collisionPairs ++ collisionPairs.map(_.swap)).toMap @@ -334,22 +335,25 @@ class ExternalAppendOnlyMapSuite extends SparkFunSuite with LocalSparkContext { assert(kv._2.equals(expectedValue)) count += 1 } - assert(count === 100000 + collisionPairs.size * 2) + assert(count === size + collisionPairs.size * 2) sc.stop() } test("spilling with many hash collisions") { + val size = 1000 val conf = createSparkConf(loadDefaults = true) + conf.set("spark.shuffle.spill.numElementsForceSpillThreshold", (size / 2).toString) sc = new SparkContext("local-cluster[1,1,1024]", "test", conf) val map = new ExternalAppendOnlyMap[FixedHashObject, Int, Int](_ => 1, _ + _, _ + _) // Insert 10 copies each of lots of objects whose hash codes are either 0 or 1. This causes // problems if the map fails to group together the objects with the same code (SPARK-2043). for (i <- 1 to 10) { - for (j <- 1 to 10000) { + for (j <- 1 to size) { map.insert(FixedHashObject(j, j % 2), 1) } } + assert(map.numSpills > 0, "map did not spill") val it = map.iterator var count = 0 @@ -358,17 +362,20 @@ class ExternalAppendOnlyMapSuite extends SparkFunSuite with LocalSparkContext { assert(kv._2 === 10) count += 1 } - assert(count === 10000) + assert(count === size) sc.stop() } test("spilling with hash collisions using the Int.MaxValue key") { + val size = 1000 val conf = createSparkConf(loadDefaults = true) + conf.set("spark.shuffle.spill.numElementsForceSpillThreshold", (size / 2).toString) sc = new SparkContext("local-cluster[1,1,1024]", "test", conf) val map = createExternalMap[Int] - (1 to 100000).foreach { i => map.insert(i, i) } + (1 to size).foreach { i => map.insert(i, i) } map.insert(Int.MaxValue, Int.MaxValue) + assert(map.numSpills > 0, "map did not spill") val it = map.iterator while (it.hasNext) { @@ -379,14 +386,17 @@ class ExternalAppendOnlyMapSuite extends SparkFunSuite with LocalSparkContext { } test("spilling with null keys and values") { + val size = 1000 val conf = createSparkConf(loadDefaults = true) + conf.set("spark.shuffle.spill.numElementsForceSpillThreshold", (size / 2).toString) sc = new SparkContext("local-cluster[1,1,1024]", "test", conf) val map = createExternalMap[Int] - map.insertAll((1 to 100000).iterator.map(i => (i, i))) + map.insertAll((1 to size).iterator.map(i => (i, i))) map.insert(null.asInstanceOf[Int], 1) map.insert(1, null.asInstanceOf[Int]) map.insert(null.asInstanceOf[Int], null.asInstanceOf[Int]) + assert(map.numSpills > 0, "map did not spill") val it = map.iterator while (it.hasNext) { @@ -397,17 +407,22 @@ class ExternalAppendOnlyMapSuite extends SparkFunSuite with LocalSparkContext { } test("external aggregation updates peak execution memory") { + val spillThreshold = 1000 val conf = createSparkConf(loadDefaults = false) .set("spark.shuffle.manager", "hash") // make sure we're not also using ExternalSorter - .set("spark.testing.memory", (10 * 1024 * 1024).toString) + .set("spark.shuffle.spill.numElementsForceSpillThreshold", spillThreshold.toString) sc = new SparkContext("local", "test", conf) // No spilling AccumulatorSuite.verifyPeakExecutionMemorySet(sc, "external map without spilling") { - sc.parallelize(1 
to 10, 2).map { i => (i, i) }.reduceByKey(_ + _).count() + assertNotSpilled(sc, "verify peak memory") { + sc.parallelize(1 to spillThreshold / 2, 2).map { i => (i, i) }.reduceByKey(_ + _).count() + } } // With spilling AccumulatorSuite.verifyPeakExecutionMemorySet(sc, "external map with spilling") { - sc.parallelize(1 to 1000 * 1000, 2).map { i => (i, i) }.reduceByKey(_ + _).count() + assertSpilled(sc, "verify peak memory") { + sc.parallelize(1 to spillThreshold * 3, 2).map { i => (i, i) }.reduceByKey(_ + _).count() + } } } diff --git a/core/src/test/scala/org/apache/spark/util/collection/ExternalSorterSuite.scala b/core/src/test/scala/org/apache/spark/util/collection/ExternalSorterSuite.scala index 651c7eaa65ff5..e2cb791771d99 100644 --- a/core/src/test/scala/org/apache/spark/util/collection/ExternalSorterSuite.scala +++ b/core/src/test/scala/org/apache/spark/util/collection/ExternalSorterSuite.scala @@ -18,535 +18,92 @@ package org.apache.spark.util.collection import scala.collection.mutable.ArrayBuffer - import scala.util.Random import org.apache.spark._ import org.apache.spark.serializer.{JavaSerializer, KryoSerializer} -// TODO: some of these spilling tests probably aren't actually spilling (SPARK-11078) class ExternalSorterSuite extends SparkFunSuite with LocalSparkContext { - private def createSparkConf(loadDefaults: Boolean, kryo: Boolean): SparkConf = { - val conf = new SparkConf(loadDefaults) - if (kryo) { - conf.set("spark.serializer", classOf[KryoSerializer].getName) - } else { - // Make the Java serializer write a reset instruction (TC_RESET) after each object to test - // for a bug we had with bytes written past the last object in a batch (SPARK-2792) - conf.set("spark.serializer.objectStreamReset", "1") - conf.set("spark.serializer", classOf[JavaSerializer].getName) - } - conf.set("spark.shuffle.sort.bypassMergeThreshold", "0") - // Ensure that we actually have multiple batches per spill file - conf.set("spark.shuffle.spill.batchSize", "10") - conf.set("spark.testing.memory", "2000000") - conf - } - - test("empty data stream with kryo ser") { - emptyDataStream(createSparkConf(false, true)) - } - - test("empty data stream with java ser") { - emptyDataStream(createSparkConf(false, false)) - } - - def emptyDataStream(conf: SparkConf) { - conf.set("spark.shuffle.manager", "org.apache.spark.shuffle.sort.SortShuffleManager") - sc = new SparkContext("local", "test", conf) - - val agg = new Aggregator[Int, Int, Int](i => i, (i, j) => i + j, (i, j) => i + j) - val ord = implicitly[Ordering[Int]] - - // Both aggregator and ordering - val sorter = new ExternalSorter[Int, Int, Int]( - Some(agg), Some(new HashPartitioner(3)), Some(ord), None) - assert(sorter.iterator.toSeq === Seq()) - sorter.stop() - - // Only aggregator - val sorter2 = new ExternalSorter[Int, Int, Int]( - Some(agg), Some(new HashPartitioner(3)), None, None) - assert(sorter2.iterator.toSeq === Seq()) - sorter2.stop() - - // Only ordering - val sorter3 = new ExternalSorter[Int, Int, Int]( - None, Some(new HashPartitioner(3)), Some(ord), None) - assert(sorter3.iterator.toSeq === Seq()) - sorter3.stop() - - // Neither aggregator nor ordering - val sorter4 = new ExternalSorter[Int, Int, Int]( - None, Some(new HashPartitioner(3)), None, None) - assert(sorter4.iterator.toSeq === Seq()) - sorter4.stop() - } + import TestUtils.{assertNotSpilled, assertSpilled} - test("few elements per partition with kryo ser") { - fewElementsPerPartition(createSparkConf(false, true)) - } + testWithMultipleSer("empty data 
stream")(emptyDataStream) - test("few elements per partition with java ser") { - fewElementsPerPartition(createSparkConf(false, false)) - } + testWithMultipleSer("few elements per partition")(fewElementsPerPartition) - def fewElementsPerPartition(conf: SparkConf) { - conf.set("spark.shuffle.manager", "org.apache.spark.shuffle.sort.SortShuffleManager") - sc = new SparkContext("local", "test", conf) - - val agg = new Aggregator[Int, Int, Int](i => i, (i, j) => i + j, (i, j) => i + j) - val ord = implicitly[Ordering[Int]] - val elements = Set((1, 1), (2, 2), (5, 5)) - val expected = Set( - (0, Set()), (1, Set((1, 1))), (2, Set((2, 2))), (3, Set()), (4, Set()), - (5, Set((5, 5))), (6, Set())) - - // Both aggregator and ordering - val sorter = new ExternalSorter[Int, Int, Int]( - Some(agg), Some(new HashPartitioner(7)), Some(ord), None) - sorter.insertAll(elements.iterator) - assert(sorter.partitionedIterator.map(p => (p._1, p._2.toSet)).toSet === expected) - sorter.stop() - - // Only aggregator - val sorter2 = new ExternalSorter[Int, Int, Int]( - Some(agg), Some(new HashPartitioner(7)), None, None) - sorter2.insertAll(elements.iterator) - assert(sorter2.partitionedIterator.map(p => (p._1, p._2.toSet)).toSet === expected) - sorter2.stop() + testWithMultipleSer("empty partitions with spilling")(emptyPartitionsWithSpilling) - // Only ordering - val sorter3 = new ExternalSorter[Int, Int, Int]( - None, Some(new HashPartitioner(7)), Some(ord), None) - sorter3.insertAll(elements.iterator) - assert(sorter3.partitionedIterator.map(p => (p._1, p._2.toSet)).toSet === expected) - sorter3.stop() - - // Neither aggregator nor ordering - val sorter4 = new ExternalSorter[Int, Int, Int]( - None, Some(new HashPartitioner(7)), None, None) - sorter4.insertAll(elements.iterator) - assert(sorter4.partitionedIterator.map(p => (p._1, p._2.toSet)).toSet === expected) - sorter4.stop() - } - - test("empty partitions with spilling with kryo ser") { - emptyPartitionsWithSpilling(createSparkConf(false, true)) + // Load defaults, otherwise SPARK_HOME is not found + testWithMultipleSer("spilling in local cluster", loadDefaults = true) { + (conf: SparkConf) => testSpillingInLocalCluster(conf, 2) } - test("empty partitions with spilling with java ser") { - emptyPartitionsWithSpilling(createSparkConf(false, false)) - } - - def emptyPartitionsWithSpilling(conf: SparkConf) { - conf.set("spark.shuffle.spill.initialMemoryThreshold", "512") - conf.set("spark.shuffle.manager", "org.apache.spark.shuffle.sort.SortShuffleManager") - sc = new SparkContext("local", "test", conf) - - val ord = implicitly[Ordering[Int]] - val elements = Iterator((1, 1), (5, 5)) ++ (0 until 100000).iterator.map(x => (2, 2)) - - val sorter = new ExternalSorter[Int, Int, Int]( - None, Some(new HashPartitioner(7)), Some(ord), None) - sorter.insertAll(elements) - assert(sc.env.blockManager.diskBlockManager.getAllFiles().length > 0) // Make sure it spilled - val iter = sorter.partitionedIterator.map(p => (p._1, p._2.toList)) - assert(iter.next() === (0, Nil)) - assert(iter.next() === (1, List((1, 1)))) - assert(iter.next() === (2, (0 until 100000).map(x => (2, 2)).toList)) - assert(iter.next() === (3, Nil)) - assert(iter.next() === (4, Nil)) - assert(iter.next() === (5, List((5, 5)))) - assert(iter.next() === (6, Nil)) - sorter.stop() - } - - test("spilling in local cluster with kryo ser") { - // Load defaults, otherwise SPARK_HOME is not found - testSpillingInLocalCluster(createSparkConf(true, true)) - } - - test("spilling in local cluster with java ser") { - // 
Load defaults, otherwise SPARK_HOME is not found - testSpillingInLocalCluster(createSparkConf(true, false)) - } - - def testSpillingInLocalCluster(conf: SparkConf) { - conf.set("spark.shuffle.manager", "org.apache.spark.shuffle.sort.SortShuffleManager") - sc = new SparkContext("local-cluster[1,1,1024]", "test", conf) - - // reduceByKey - should spill ~8 times - val rddA = sc.parallelize(0 until 100000).map(i => (i/2, i)) - val resultA = rddA.reduceByKey(math.max).collect() - assert(resultA.length == 50000) - resultA.foreach { case(k, v) => - if (v != k * 2 + 1) { - fail(s"Value for ${k} was wrong: expected ${k * 2 + 1}, got ${v}") - } - } - - // groupByKey - should spill ~17 times - val rddB = sc.parallelize(0 until 100000).map(i => (i/4, i)) - val resultB = rddB.groupByKey().collect() - assert(resultB.length == 25000) - resultB.foreach { case(i, seq) => - val expected = Set(i * 4, i * 4 + 1, i * 4 + 2, i * 4 + 3) - if (seq.toSet != expected) { - fail(s"Value for ${i} was wrong: expected ${expected}, got ${seq.toSet}") - } - } - - // cogroup - should spill ~7 times - val rddC1 = sc.parallelize(0 until 10000).map(i => (i, i)) - val rddC2 = sc.parallelize(0 until 10000).map(i => (i%1000, i)) - val resultC = rddC1.cogroup(rddC2).collect() - assert(resultC.length == 10000) - resultC.foreach { case(i, (seq1, seq2)) => - i match { - case 0 => - assert(seq1.toSet == Set[Int](0)) - assert(seq2.toSet == Set[Int](0, 1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 9000)) - case 1 => - assert(seq1.toSet == Set[Int](1)) - assert(seq2.toSet == Set[Int](1, 1001, 2001, 3001, 4001, 5001, 6001, 7001, 8001, 9001)) - case 5000 => - assert(seq1.toSet == Set[Int](5000)) - assert(seq2.toSet == Set[Int]()) - case 9999 => - assert(seq1.toSet == Set[Int](9999)) - assert(seq2.toSet == Set[Int]()) - case _ => - } - } - - // larger cogroup - should spill ~7 times - val rddD1 = sc.parallelize(0 until 10000).map(i => (i/2, i)) - val rddD2 = sc.parallelize(0 until 10000).map(i => (i/2, i)) - val resultD = rddD1.cogroup(rddD2).collect() - assert(resultD.length == 5000) - resultD.foreach { case(i, (seq1, seq2)) => - val expected = Set(i * 2, i * 2 + 1) - if (seq1.toSet != expected) { - fail(s"Value 1 for ${i} was wrong: expected ${expected}, got ${seq1.toSet}") - } - if (seq2.toSet != expected) { - fail(s"Value 2 for ${i} was wrong: expected ${expected}, got ${seq2.toSet}") - } - } - - // sortByKey - should spill ~17 times - val rddE = sc.parallelize(0 until 100000).map(i => (i/4, i)) - val resultE = rddE.sortByKey().collect().toSeq - assert(resultE === (0 until 100000).map(i => (i/4, i)).toSeq) - } - - test("spilling in local cluster with many reduce tasks with kryo ser") { - spillingInLocalClusterWithManyReduceTasks(createSparkConf(true, true)) - } - - test("spilling in local cluster with many reduce tasks with java ser") { - spillingInLocalClusterWithManyReduceTasks(createSparkConf(true, false)) - } - - def spillingInLocalClusterWithManyReduceTasks(conf: SparkConf) { - conf.set("spark.shuffle.manager", "org.apache.spark.shuffle.sort.SortShuffleManager") - sc = new SparkContext("local-cluster[2,1,1024]", "test", conf) - - // reduceByKey - should spill ~4 times per executor - val rddA = sc.parallelize(0 until 100000).map(i => (i/2, i)) - val resultA = rddA.reduceByKey(math.max _, 100).collect() - assert(resultA.length == 50000) - resultA.foreach { case(k, v) => - if (v != k * 2 + 1) { - fail(s"Value for ${k} was wrong: expected ${k * 2 + 1}, got ${v}") - } - } - - // groupByKey - should spill ~8 times per executor - val 
rddB = sc.parallelize(0 until 100000).map(i => (i/4, i)) - val resultB = rddB.groupByKey(100).collect() - assert(resultB.length == 25000) - resultB.foreach { case(i, seq) => - val expected = Set(i * 4, i * 4 + 1, i * 4 + 2, i * 4 + 3) - if (seq.toSet != expected) { - fail(s"Value for ${i} was wrong: expected ${expected}, got ${seq.toSet}") - } - } - - // cogroup - should spill ~4 times per executor - val rddC1 = sc.parallelize(0 until 10000).map(i => (i, i)) - val rddC2 = sc.parallelize(0 until 10000).map(i => (i%1000, i)) - val resultC = rddC1.cogroup(rddC2, 100).collect() - assert(resultC.length == 10000) - resultC.foreach { case(i, (seq1, seq2)) => - i match { - case 0 => - assert(seq1.toSet == Set[Int](0)) - assert(seq2.toSet == Set[Int](0, 1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 9000)) - case 1 => - assert(seq1.toSet == Set[Int](1)) - assert(seq2.toSet == Set[Int](1, 1001, 2001, 3001, 4001, 5001, 6001, 7001, 8001, 9001)) - case 5000 => - assert(seq1.toSet == Set[Int](5000)) - assert(seq2.toSet == Set[Int]()) - case 9999 => - assert(seq1.toSet == Set[Int](9999)) - assert(seq2.toSet == Set[Int]()) - case _ => - } - } - - // larger cogroup - should spill ~4 times per executor - val rddD1 = sc.parallelize(0 until 10000).map(i => (i/2, i)) - val rddD2 = sc.parallelize(0 until 10000).map(i => (i/2, i)) - val resultD = rddD1.cogroup(rddD2).collect() - assert(resultD.length == 5000) - resultD.foreach { case(i, (seq1, seq2)) => - val expected = Set(i * 2, i * 2 + 1) - if (seq1.toSet != expected) { - fail(s"Value 1 for ${i} was wrong: expected ${expected}, got ${seq1.toSet}") - } - if (seq2.toSet != expected) { - fail(s"Value 2 for ${i} was wrong: expected ${expected}, got ${seq2.toSet}") - } - } - - // sortByKey - should spill ~8 times per executor - val rddE = sc.parallelize(0 until 100000).map(i => (i/4, i)) - val resultE = rddE.sortByKey().collect().toSeq - assert(resultE === (0 until 100000).map(i => (i/4, i)).toSeq) + testWithMultipleSer("spilling in local cluster with many reduce tasks", loadDefaults = true) { + (conf: SparkConf) => testSpillingInLocalCluster(conf, 100) } test("cleanup of intermediate files in sorter") { - val conf = createSparkConf(true, false) // Load defaults, otherwise SPARK_HOME is not found - conf.set("spark.shuffle.manager", "org.apache.spark.shuffle.sort.SortShuffleManager") - sc = new SparkContext("local", "test", conf) - val diskBlockManager = SparkEnv.get.blockManager.diskBlockManager - - val ord = implicitly[Ordering[Int]] - - val sorter = new ExternalSorter[Int, Int, Int]( - None, Some(new HashPartitioner(3)), Some(ord), None) - sorter.insertAll((0 until 120000).iterator.map(i => (i, i))) - assert(diskBlockManager.getAllFiles().length > 0) - sorter.stop() - assert(diskBlockManager.getAllBlocks().length === 0) - - val sorter2 = new ExternalSorter[Int, Int, Int]( - None, Some(new HashPartitioner(3)), Some(ord), None) - sorter2.insertAll((0 until 120000).iterator.map(i => (i, i))) - assert(diskBlockManager.getAllFiles().length > 0) - assert(sorter2.iterator.toSet === (0 until 120000).map(i => (i, i)).toSet) - sorter2.stop() - assert(diskBlockManager.getAllBlocks().length === 0) + cleanupIntermediateFilesInSorter(withFailures = false) } - test("cleanup of intermediate files in sorter if there are errors") { - val conf = createSparkConf(true, false) // Load defaults, otherwise SPARK_HOME is not found - conf.set("spark.shuffle.manager", "org.apache.spark.shuffle.sort.SortShuffleManager") - sc = new SparkContext("local", "test", conf) - val diskBlockManager 
= SparkEnv.get.blockManager.diskBlockManager - - val ord = implicitly[Ordering[Int]] - - val sorter = new ExternalSorter[Int, Int, Int]( - None, Some(new HashPartitioner(3)), Some(ord), None) - intercept[SparkException] { - sorter.insertAll((0 until 120000).iterator.map(i => { - if (i == 119990) { - throw new SparkException("Intentional failure") - } - (i, i) - })) - } - assert(diskBlockManager.getAllFiles().length > 0) - sorter.stop() - assert(diskBlockManager.getAllBlocks().length === 0) + test("cleanup of intermediate files in sorter with failures") { + cleanupIntermediateFilesInSorter(withFailures = true) } test("cleanup of intermediate files in shuffle") { - val conf = createSparkConf(false, false) - conf.set("spark.shuffle.manager", "org.apache.spark.shuffle.sort.SortShuffleManager") - sc = new SparkContext("local", "test", conf) - val diskBlockManager = SparkEnv.get.blockManager.diskBlockManager - - val data = sc.parallelize(0 until 100000, 2).map(i => (i, i)) - assert(data.reduceByKey(_ + _).count() === 100000) - - // After the shuffle, there should be only 4 files on disk: our two map output files and - // their index files. All other intermediate files should've been deleted. - assert(diskBlockManager.getAllFiles().length === 4) - } - - test("cleanup of intermediate files in shuffle with errors") { - val conf = createSparkConf(false, false) - conf.set("spark.shuffle.manager", "org.apache.spark.shuffle.sort.SortShuffleManager") - sc = new SparkContext("local", "test", conf) - val diskBlockManager = SparkEnv.get.blockManager.diskBlockManager - - val data = sc.parallelize(0 until 100000, 2).map(i => { - if (i == 99990) { - throw new Exception("Intentional failure") - } - (i, i) - }) - intercept[SparkException] { - data.reduceByKey(_ + _).count() - } - - // After the shuffle, there should be only 2 files on disk: the output of task 1 and its index. - // All other files (map 2's output and intermediate merge files) should've been deleted. 
- assert(diskBlockManager.getAllFiles().length === 2) - } - - test("no partial aggregation or sorting with kryo ser") { - noPartialAggregationOrSorting(createSparkConf(false, true)) - } - - test("no partial aggregation or sorting with java ser") { - noPartialAggregationOrSorting(createSparkConf(false, false)) - } - - def noPartialAggregationOrSorting(conf: SparkConf) { - conf.set("spark.shuffle.manager", "org.apache.spark.shuffle.sort.SortShuffleManager") - sc = new SparkContext("local", "test", conf) - - val sorter = new ExternalSorter[Int, Int, Int](None, Some(new HashPartitioner(3)), None, None) - sorter.insertAll((0 until 100000).iterator.map(i => (i / 4, i))) - val results = sorter.partitionedIterator.map{case (p, vs) => (p, vs.toSet)}.toSet - val expected = (0 until 3).map(p => { - (p, (0 until 100000).map(i => (i / 4, i)).filter(_._1 % 3 == p).toSet) - }).toSet - assert(results === expected) - } - - test("partial aggregation without spill with kryo ser") { - partialAggregationWithoutSpill(createSparkConf(false, true)) - } - - test("partial aggregation without spill with java ser") { - partialAggregationWithoutSpill(createSparkConf(false, false)) - } - - def partialAggregationWithoutSpill(conf: SparkConf) { - conf.set("spark.shuffle.manager", "org.apache.spark.shuffle.sort.SortShuffleManager") - sc = new SparkContext("local", "test", conf) - - val agg = new Aggregator[Int, Int, Int](i => i, (i, j) => i + j, (i, j) => i + j) - val sorter = new ExternalSorter(Some(agg), Some(new HashPartitioner(3)), None, None) - sorter.insertAll((0 until 100).iterator.map(i => (i / 2, i))) - val results = sorter.partitionedIterator.map{case (p, vs) => (p, vs.toSet)}.toSet - val expected = (0 until 3).map(p => { - (p, (0 until 50).map(i => (i, i * 4 + 1)).filter(_._1 % 3 == p).toSet) - }).toSet - assert(results === expected) + cleanupIntermediateFilesInShuffle(withFailures = false) } - test("partial aggregation with spill, no ordering with kryo ser") { - partialAggregationWIthSpillNoOrdering(createSparkConf(false, true)) + test("cleanup of intermediate files in shuffle with failures") { + cleanupIntermediateFilesInShuffle(withFailures = true) } - test("partial aggregation with spill, no ordering with java ser") { - partialAggregationWIthSpillNoOrdering(createSparkConf(false, false)) + testWithMultipleSer("no sorting or partial aggregation") { (conf: SparkConf) => + basicSorterTest(conf, withPartialAgg = false, withOrdering = false, withSpilling = false) } - def partialAggregationWIthSpillNoOrdering(conf: SparkConf) { - conf.set("spark.shuffle.manager", "org.apache.spark.shuffle.sort.SortShuffleManager") - sc = new SparkContext("local", "test", conf) - - val agg = new Aggregator[Int, Int, Int](i => i, (i, j) => i + j, (i, j) => i + j) - val sorter = new ExternalSorter(Some(agg), Some(new HashPartitioner(3)), None, None) - sorter.insertAll((0 until 100000).iterator.map(i => (i / 2, i))) - val results = sorter.partitionedIterator.map{case (p, vs) => (p, vs.toSet)}.toSet - val expected = (0 until 3).map(p => { - (p, (0 until 50000).map(i => (i, i * 4 + 1)).filter(_._1 % 3 == p).toSet) - }).toSet - assert(results === expected) + testWithMultipleSer("no sorting or partial aggregation with spilling") { (conf: SparkConf) => + basicSorterTest(conf, withPartialAgg = false, withOrdering = false, withSpilling = true) } - test("partial aggregation with spill, with ordering with kryo ser") { - partialAggregationWithSpillWithOrdering(createSparkConf(false, true)) + testWithMultipleSer("sorting, no partial 
aggregation") { (conf: SparkConf) => + basicSorterTest(conf, withPartialAgg = false, withOrdering = true, withSpilling = false) } - - test("partial aggregation with spill, with ordering with java ser") { - partialAggregationWithSpillWithOrdering(createSparkConf(false, false)) + testWithMultipleSer("sorting, no partial aggregation with spilling") { (conf: SparkConf) => + basicSorterTest(conf, withPartialAgg = false, withOrdering = true, withSpilling = true) } - def partialAggregationWithSpillWithOrdering(conf: SparkConf) { - conf.set("spark.shuffle.manager", "org.apache.spark.shuffle.sort.SortShuffleManager") - sc = new SparkContext("local", "test", conf) - - val agg = new Aggregator[Int, Int, Int](i => i, (i, j) => i + j, (i, j) => i + j) - val ord = implicitly[Ordering[Int]] - val sorter = new ExternalSorter(Some(agg), Some(new HashPartitioner(3)), Some(ord), None) - - // avoid combine before spill - sorter.insertAll((0 until 50000).iterator.map(i => (i , 2 * i))) - sorter.insertAll((0 until 50000).iterator.map(i => (i, 2 * i + 1))) - val results = sorter.partitionedIterator.map{case (p, vs) => (p, vs.toSet)}.toSet - val expected = (0 until 3).map(p => { - (p, (0 until 50000).map(i => (i, i * 4 + 1)).filter(_._1 % 3 == p).toSet) - }).toSet - assert(results === expected) + testWithMultipleSer("partial aggregation, no sorting") { (conf: SparkConf) => + basicSorterTest(conf, withPartialAgg = true, withOrdering = false, withSpilling = false) } - test("sorting without aggregation, no spill with kryo ser") { - sortingWithoutAggregationNoSpill(createSparkConf(false, true)) + testWithMultipleSer("partial aggregation, no sorting with spilling") { (conf: SparkConf) => + basicSorterTest(conf, withPartialAgg = true, withOrdering = false, withSpilling = true) } - test("sorting without aggregation, no spill with java ser") { - sortingWithoutAggregationNoSpill(createSparkConf(false, false)) + testWithMultipleSer("partial aggregation and sorting") { (conf: SparkConf) => + basicSorterTest(conf, withPartialAgg = true, withOrdering = true, withSpilling = false) } - def sortingWithoutAggregationNoSpill(conf: SparkConf) { - conf.set("spark.shuffle.manager", "org.apache.spark.shuffle.sort.SortShuffleManager") - sc = new SparkContext("local", "test", conf) - - val ord = implicitly[Ordering[Int]] - val sorter = new ExternalSorter[Int, Int, Int]( - None, Some(new HashPartitioner(3)), Some(ord), None) - sorter.insertAll((0 until 100).iterator.map(i => (i, i))) - val results = sorter.partitionedIterator.map{case (p, vs) => (p, vs.toSeq)}.toSeq - val expected = (0 until 3).map(p => { - (p, (0 until 100).map(i => (i, i)).filter(_._1 % 3 == p).toSeq) - }).toSeq - assert(results === expected) - } - - test("sorting without aggregation, with spill with kryo ser") { - sortingWithoutAggregationWithSpill(createSparkConf(false, true)) - } - - test("sorting without aggregation, with spill with java ser") { - sortingWithoutAggregationWithSpill(createSparkConf(false, false)) + testWithMultipleSer("partial aggregation and sorting with spilling") { (conf: SparkConf) => + basicSorterTest(conf, withPartialAgg = true, withOrdering = true, withSpilling = true) } - def sortingWithoutAggregationWithSpill(conf: SparkConf) { - conf.set("spark.shuffle.manager", "org.apache.spark.shuffle.sort.SortShuffleManager") - sc = new SparkContext("local", "test", conf) - - val ord = implicitly[Ordering[Int]] - val sorter = new ExternalSorter[Int, Int, Int]( - None, Some(new HashPartitioner(3)), Some(ord), None) - sorter.insertAll((0 until 
100000).iterator.map(i => (i, i))) - val results = sorter.partitionedIterator.map{case (p, vs) => (p, vs.toSeq)}.toSeq - val expected = (0 until 3).map(p => { - (p, (0 until 100000).map(i => (i, i)).filter(_._1 % 3 == p).toSeq) - }).toSeq - assert(results === expected) - } + testWithMultipleSer("sort without breaking sorting contracts", loadDefaults = true)( + sortWithoutBreakingSortingContracts) test("spilling with hash collisions") { - val conf = createSparkConf(true, false) + val size = 1000 + val conf = createSparkConf(loadDefaults = true, kryo = false) + conf.set("spark.shuffle.spill.numElementsForceSpillThreshold", (size / 2).toString) sc = new SparkContext("local-cluster[1,1,1024]", "test", conf) def createCombiner(i: String): ArrayBuffer[String] = ArrayBuffer[String](i) def mergeValue(buffer: ArrayBuffer[String], i: String): ArrayBuffer[String] = buffer += i - def mergeCombiners(buffer1: ArrayBuffer[String], buffer2: ArrayBuffer[String]) - : ArrayBuffer[String] = buffer1 ++= buffer2 + def mergeCombiners( + buffer1: ArrayBuffer[String], + buffer2: ArrayBuffer[String]): ArrayBuffer[String] = buffer1 ++= buffer2 val agg = new Aggregator[String, String, ArrayBuffer[String]]( createCombiner _, mergeValue _, mergeCombiners _) @@ -574,10 +131,11 @@ class ExternalSorterSuite extends SparkFunSuite with LocalSparkContext { assert(w1.hashCode === w2.hashCode) } - val toInsert = (1 to 100000).iterator.map(_.toString).map(s => (s, s)) ++ + val toInsert = (1 to size).iterator.map(_.toString).map(s => (s, s)) ++ collisionPairs.iterator ++ collisionPairs.iterator.map(_.swap) sorter.insertAll(toInsert) + assert(sorter.numSpills > 0, "sorter did not spill") // A map of collision pairs in both directions val collisionPairsMap = (collisionPairs ++ collisionPairs.map(_.swap)).toMap @@ -592,21 +150,21 @@ class ExternalSorterSuite extends SparkFunSuite with LocalSparkContext { assert(kv._2.equals(expectedValue)) count += 1 } - assert(count === 100000 + collisionPairs.size * 2) + assert(count === size + collisionPairs.size * 2) } test("spilling with many hash collisions") { - val conf = createSparkConf(true, false) + val size = 1000 + val conf = createSparkConf(loadDefaults = true, kryo = false) + conf.set("spark.shuffle.spill.numElementsForceSpillThreshold", (size / 2).toString) sc = new SparkContext("local-cluster[1,1,1024]", "test", conf) - val agg = new Aggregator[FixedHashObject, Int, Int](_ => 1, _ + _, _ + _) val sorter = new ExternalSorter[FixedHashObject, Int, Int](Some(agg), None, None, None) - // Insert 10 copies each of lots of objects whose hash codes are either 0 or 1. This causes // problems if the map fails to group together the objects with the same code (SPARK-2043). 
- val toInsert = for (i <- 1 to 10; j <- 1 to 10000) yield (FixedHashObject(j, j % 2), 1) + val toInsert = for (i <- 1 to 10; j <- 1 to size) yield (FixedHashObject(j, j % 2), 1) sorter.insertAll(toInsert.iterator) - + assert(sorter.numSpills > 0, "sorter did not spill") val it = sorter.iterator var count = 0 while (it.hasNext) { @@ -614,11 +172,13 @@ class ExternalSorterSuite extends SparkFunSuite with LocalSparkContext { assert(kv._2 === 10) count += 1 } - assert(count === 10000) + assert(count === size) } test("spilling with hash collisions using the Int.MaxValue key") { - val conf = createSparkConf(true, false) + val size = 1000 + val conf = createSparkConf(loadDefaults = true, kryo = false) + conf.set("spark.shuffle.spill.numElementsForceSpillThreshold", (size / 2).toString) sc = new SparkContext("local-cluster[1,1,1024]", "test", conf) def createCombiner(i: Int): ArrayBuffer[Int] = ArrayBuffer[Int](i) @@ -629,10 +189,9 @@ class ExternalSorterSuite extends SparkFunSuite with LocalSparkContext { val agg = new Aggregator[Int, Int, ArrayBuffer[Int]](createCombiner, mergeValue, mergeCombiners) val sorter = new ExternalSorter[Int, Int, ArrayBuffer[Int]](Some(agg), None, None, None) - sorter.insertAll( - (1 to 100000).iterator.map(i => (i, i)) ++ Iterator((Int.MaxValue, Int.MaxValue))) - + (1 to size).iterator.map(i => (i, i)) ++ Iterator((Int.MaxValue, Int.MaxValue))) + assert(sorter.numSpills > 0, "sorter did not spill") val it = sorter.iterator while (it.hasNext) { // Should not throw NoSuchElementException @@ -641,7 +200,9 @@ class ExternalSorterSuite extends SparkFunSuite with LocalSparkContext { } test("spilling with null keys and values") { - val conf = createSparkConf(true, false) + val size = 1000 + val conf = createSparkConf(loadDefaults = true, kryo = false) + conf.set("spark.shuffle.spill.numElementsForceSpillThreshold", (size / 2).toString) sc = new SparkContext("local-cluster[1,1,1024]", "test", conf) def createCombiner(i: String): ArrayBuffer[String] = ArrayBuffer[String](i) @@ -655,12 +216,12 @@ class ExternalSorterSuite extends SparkFunSuite with LocalSparkContext { val sorter = new ExternalSorter[String, String, ArrayBuffer[String]]( Some(agg), None, None, None) - sorter.insertAll((1 to 100000).iterator.map(i => (i.toString, i.toString)) ++ Iterator( + sorter.insertAll((1 to size).iterator.map(i => (i.toString, i.toString)) ++ Iterator( (null.asInstanceOf[String], "1"), ("1", null.asInstanceOf[String]), (null.asInstanceOf[String], null.asInstanceOf[String]) )) - + assert(sorter.numSpills > 0, "sorter did not spill") val it = sorter.iterator while (it.hasNext) { // Should not throw NullPointerException @@ -668,16 +229,301 @@ class ExternalSorterSuite extends SparkFunSuite with LocalSparkContext { } } - test("sort without breaking sorting contracts with kryo ser") { - sortWithoutBreakingSortingContracts(createSparkConf(true, true)) + /* ============================= * + | Helper test utility methods | + * ============================= */ + + private def createSparkConf(loadDefaults: Boolean, kryo: Boolean): SparkConf = { + val conf = new SparkConf(loadDefaults) + if (kryo) { + conf.set("spark.serializer", classOf[KryoSerializer].getName) + } else { + // Make the Java serializer write a reset instruction (TC_RESET) after each object to test + // for a bug we had with bytes written past the last object in a batch (SPARK-2792) + conf.set("spark.serializer.objectStreamReset", "1") + conf.set("spark.serializer", classOf[JavaSerializer].getName) + } + 
conf.set("spark.shuffle.sort.bypassMergeThreshold", "0") + // Ensure that we actually have multiple batches per spill file + conf.set("spark.shuffle.spill.batchSize", "10") + conf.set("spark.shuffle.spill.initialMemoryThreshold", "512") + conf + } + + /** + * Run a test multiple times, each time with a different serializer. + */ + private def testWithMultipleSer( + name: String, + loadDefaults: Boolean = false)(body: (SparkConf => Unit)): Unit = { + test(name + " with kryo ser") { + body(createSparkConf(loadDefaults, kryo = true)) + } + test(name + " with java ser") { + body(createSparkConf(loadDefaults, kryo = false)) + } } - test("sort without breaking sorting contracts with java ser") { - sortWithoutBreakingSortingContracts(createSparkConf(true, false)) + /* =========================================== * + | Helper methods that contain the test body | + * =========================================== */ + + private def emptyDataStream(conf: SparkConf) { + conf.set("spark.shuffle.manager", "sort") + sc = new SparkContext("local", "test", conf) + + val agg = new Aggregator[Int, Int, Int](i => i, (i, j) => i + j, (i, j) => i + j) + val ord = implicitly[Ordering[Int]] + + // Both aggregator and ordering + val sorter = new ExternalSorter[Int, Int, Int]( + Some(agg), Some(new HashPartitioner(3)), Some(ord), None) + assert(sorter.iterator.toSeq === Seq()) + sorter.stop() + + // Only aggregator + val sorter2 = new ExternalSorter[Int, Int, Int]( + Some(agg), Some(new HashPartitioner(3)), None, None) + assert(sorter2.iterator.toSeq === Seq()) + sorter2.stop() + + // Only ordering + val sorter3 = new ExternalSorter[Int, Int, Int]( + None, Some(new HashPartitioner(3)), Some(ord), None) + assert(sorter3.iterator.toSeq === Seq()) + sorter3.stop() + + // Neither aggregator nor ordering + val sorter4 = new ExternalSorter[Int, Int, Int]( + None, Some(new HashPartitioner(3)), None, None) + assert(sorter4.iterator.toSeq === Seq()) + sorter4.stop() + } + + private def fewElementsPerPartition(conf: SparkConf) { + conf.set("spark.shuffle.manager", "sort") + sc = new SparkContext("local", "test", conf) + + val agg = new Aggregator[Int, Int, Int](i => i, (i, j) => i + j, (i, j) => i + j) + val ord = implicitly[Ordering[Int]] + val elements = Set((1, 1), (2, 2), (5, 5)) + val expected = Set( + (0, Set()), (1, Set((1, 1))), (2, Set((2, 2))), (3, Set()), (4, Set()), + (5, Set((5, 5))), (6, Set())) + + // Both aggregator and ordering + val sorter = new ExternalSorter[Int, Int, Int]( + Some(agg), Some(new HashPartitioner(7)), Some(ord), None) + sorter.insertAll(elements.iterator) + assert(sorter.partitionedIterator.map(p => (p._1, p._2.toSet)).toSet === expected) + sorter.stop() + + // Only aggregator + val sorter2 = new ExternalSorter[Int, Int, Int]( + Some(agg), Some(new HashPartitioner(7)), None, None) + sorter2.insertAll(elements.iterator) + assert(sorter2.partitionedIterator.map(p => (p._1, p._2.toSet)).toSet === expected) + sorter2.stop() + + // Only ordering + val sorter3 = new ExternalSorter[Int, Int, Int]( + None, Some(new HashPartitioner(7)), Some(ord), None) + sorter3.insertAll(elements.iterator) + assert(sorter3.partitionedIterator.map(p => (p._1, p._2.toSet)).toSet === expected) + sorter3.stop() + + // Neither aggregator nor ordering + val sorter4 = new ExternalSorter[Int, Int, Int]( + None, Some(new HashPartitioner(7)), None, None) + sorter4.insertAll(elements.iterator) + assert(sorter4.partitionedIterator.map(p => (p._1, p._2.toSet)).toSet === expected) + sorter4.stop() + } + + private def 
emptyPartitionsWithSpilling(conf: SparkConf) { + val size = 1000 + conf.set("spark.shuffle.manager", "sort") + conf.set("spark.shuffle.spill.numElementsForceSpillThreshold", (size / 2).toString) + sc = new SparkContext("local", "test", conf) + + val ord = implicitly[Ordering[Int]] + val elements = Iterator((1, 1), (5, 5)) ++ (0 until size).iterator.map(x => (2, 2)) + + val sorter = new ExternalSorter[Int, Int, Int]( + None, Some(new HashPartitioner(7)), Some(ord), None) + sorter.insertAll(elements) + assert(sorter.numSpills > 0, "sorter did not spill") + val iter = sorter.partitionedIterator.map(p => (p._1, p._2.toList)) + assert(iter.next() === (0, Nil)) + assert(iter.next() === (1, List((1, 1)))) + assert(iter.next() === (2, (0 until 1000).map(x => (2, 2)).toList)) + assert(iter.next() === (3, Nil)) + assert(iter.next() === (4, Nil)) + assert(iter.next() === (5, List((5, 5)))) + assert(iter.next() === (6, Nil)) + sorter.stop() + } + + private def testSpillingInLocalCluster(conf: SparkConf, numReduceTasks: Int) { + val size = 5000 + conf.set("spark.shuffle.manager", "sort") + conf.set("spark.shuffle.spill.numElementsForceSpillThreshold", (size / 4).toString) + sc = new SparkContext("local-cluster[1,1,1024]", "test", conf) + + assertSpilled(sc, "reduceByKey") { + val result = sc.parallelize(0 until size) + .map { i => (i / 2, i) } + .reduceByKey(math.max _, numReduceTasks) + .collect() + assert(result.length === size / 2) + result.foreach { case (k, v) => + val expected = k * 2 + 1 + assert(v === expected, s"Value for $k was wrong: expected $expected, got $v") + } + } + + assertSpilled(sc, "groupByKey") { + val result = sc.parallelize(0 until size) + .map { i => (i / 2, i) } + .groupByKey(numReduceTasks) + .collect() + assert(result.length == size / 2) + result.foreach { case (i, seq) => + val actual = seq.toSet + val expected = Set(i * 2, i * 2 + 1) + assert(actual === expected, s"Value for $i was wrong: expected $expected, got $actual") + } + } + + assertSpilled(sc, "cogroup") { + val rdd1 = sc.parallelize(0 until size).map { i => (i / 2, i) } + val rdd2 = sc.parallelize(0 until size).map { i => (i / 2, i) } + val result = rdd1.cogroup(rdd2, numReduceTasks).collect() + assert(result.length === size / 2) + result.foreach { case (i, (seq1, seq2)) => + val actual1 = seq1.toSet + val actual2 = seq2.toSet + val expected = Set(i * 2, i * 2 + 1) + assert(actual1 === expected, s"Value 1 for $i was wrong: expected $expected, got $actual1") + assert(actual2 === expected, s"Value 2 for $i was wrong: expected $expected, got $actual2") + } + } + + assertSpilled(sc, "sortByKey") { + val result = sc.parallelize(0 until size) + .map { i => (i / 2, i) } + .sortByKey(numPartitions = numReduceTasks) + .collect() + val expected = (0 until size).map { i => (i / 2, i) }.toArray + assert(result.length === size) + result.zipWithIndex.foreach { case ((k, _), i) => + val (expectedKey, _) = expected(i) + assert(k === expectedKey, s"Value for $i was wrong: expected $expectedKey, got $k") + } + } + } + + private def cleanupIntermediateFilesInSorter(withFailures: Boolean): Unit = { + val size = 1200 + val conf = createSparkConf(loadDefaults = false, kryo = false) + conf.set("spark.shuffle.manager", "sort") + conf.set("spark.shuffle.spill.numElementsForceSpillThreshold", (size / 4).toString) + sc = new SparkContext("local", "test", conf) + val diskBlockManager = sc.env.blockManager.diskBlockManager + val ord = implicitly[Ordering[Int]] + val expectedSize = if (withFailures) size - 1 else size + val sorter = new 
ExternalSorter[Int, Int, Int]( + None, Some(new HashPartitioner(3)), Some(ord), None) + if (withFailures) { + intercept[SparkException] { + sorter.insertAll((0 until size).iterator.map { i => + if (i == size - 1) { throw new SparkException("intentional failure") } + (i, i) + }) + } + } else { + sorter.insertAll((0 until size).iterator.map(i => (i, i))) + } + assert(sorter.iterator.toSet === (0 until expectedSize).map(i => (i, i)).toSet) + assert(sorter.numSpills > 0, "sorter did not spill") + assert(diskBlockManager.getAllFiles().nonEmpty, "sorter did not spill") + sorter.stop() + assert(diskBlockManager.getAllFiles().isEmpty, "spilled files were not cleaned up") + } + + private def cleanupIntermediateFilesInShuffle(withFailures: Boolean): Unit = { + val size = 1200 + val conf = createSparkConf(loadDefaults = false, kryo = false) + conf.set("spark.shuffle.manager", "sort") + conf.set("spark.shuffle.spill.numElementsForceSpillThreshold", (size / 4).toString) + sc = new SparkContext("local", "test", conf) + val diskBlockManager = sc.env.blockManager.diskBlockManager + val data = sc.parallelize(0 until size, 2).map { i => + if (withFailures && i == size - 1) { + throw new SparkException("intentional failure") + } + (i, i) + } + + assertSpilled(sc, "test shuffle cleanup") { + if (withFailures) { + intercept[SparkException] { + data.reduceByKey(_ + _).count() + } + // After the shuffle, there should be only 2 files on disk: the output of task 1 and + // its index. All other files (map 2's output and intermediate merge files) should + // have been deleted. + assert(diskBlockManager.getAllFiles().length === 2) + } else { + assert(data.reduceByKey(_ + _).count() === size) + // After the shuffle, there should be only 4 files on disk: the output of both tasks + // and their indices. All intermediate merge files should have been deleted. 
+ assert(diskBlockManager.getAllFiles().length === 4) + } + } + } + + private def basicSorterTest( + conf: SparkConf, + withPartialAgg: Boolean, + withOrdering: Boolean, + withSpilling: Boolean) { + val size = 1000 + if (withSpilling) { + conf.set("spark.shuffle.spill.numElementsForceSpillThreshold", (size / 2).toString) + } + conf.set("spark.shuffle.manager", "sort") + sc = new SparkContext("local", "test", conf) + val agg = + if (withPartialAgg) { + Some(new Aggregator[Int, Int, Int](i => i, (i, j) => i + j, (i, j) => i + j)) + } else { + None + } + val ord = if (withOrdering) Some(implicitly[Ordering[Int]]) else None + val sorter = new ExternalSorter[Int, Int, Int](agg, Some(new HashPartitioner(3)), ord, None) + sorter.insertAll((0 until size).iterator.map { i => (i / 4, i) }) + if (withSpilling) { + assert(sorter.numSpills > 0, "sorter did not spill") + } else { + assert(sorter.numSpills === 0, "sorter spilled") + } + val results = sorter.partitionedIterator.map { case (p, vs) => (p, vs.toSet) }.toSet + val expected = (0 until 3).map { p => + var v = (0 until size).map { i => (i / 4, i) }.filter { case (k, _) => k % 3 == p }.toSet + if (withPartialAgg) { + v = v.groupBy(_._1).mapValues { s => s.map(_._2).sum }.toSet + } + (p, v.toSet) + }.toSet + assert(results === expected) } private def sortWithoutBreakingSortingContracts(conf: SparkConf) { + val size = 100000 + val conf = createSparkConf(loadDefaults = true, kryo = false) conf.set("spark.shuffle.manager", "sort") + conf.set("spark.shuffle.spill.numElementsForceSpillThreshold", (size / 2).toString) sc = new SparkContext("local-cluster[1,1,1024]", "test", conf) // Using wrongOrdering to show integer overflow introduced exception. @@ -690,17 +536,18 @@ class ExternalSorterSuite extends SparkFunSuite with LocalSparkContext { } } - val testData = Array.tabulate(100000) { _ => rand.nextInt().toString } + val testData = Array.tabulate(size) { _ => rand.nextInt().toString } val sorter1 = new ExternalSorter[String, String, String]( None, None, Some(wrongOrdering), None) val thrown = intercept[IllegalArgumentException] { sorter1.insertAll(testData.iterator.map(i => (i, i))) + assert(sorter1.numSpills > 0, "sorter did not spill") sorter1.iterator } - assert(thrown.getClass() === classOf[IllegalArgumentException]) - assert(thrown.getMessage().contains("Comparison method violates its general contract")) + assert(thrown.getClass === classOf[IllegalArgumentException]) + assert(thrown.getMessage.contains("Comparison method violates its general contract")) sorter1.stop() // Using aggregation and external spill to make sure ExternalSorter using @@ -716,6 +563,7 @@ class ExternalSorterSuite extends SparkFunSuite with LocalSparkContext { val sorter2 = new ExternalSorter[String, String, ArrayBuffer[String]]( Some(agg), None, None, None) sorter2.insertAll(testData.iterator.map(i => (i, i))) + assert(sorter2.numSpills > 0, "sorter did not spill") // To validate the hash ordering of key var minKey = Int.MinValue @@ -729,12 +577,23 @@ class ExternalSorterSuite extends SparkFunSuite with LocalSparkContext { } test("sorting updates peak execution memory") { + val spillThreshold = 1000 val conf = createSparkConf(loadDefaults = false, kryo = false) .set("spark.shuffle.manager", "sort") + .set("spark.shuffle.spill.numElementsForceSpillThreshold", spillThreshold.toString) sc = new SparkContext("local", "test", conf) // Avoid aggregating here to make sure we're not also using ExternalAppendOnlyMap - AccumulatorSuite.verifyPeakExecutionMemorySet(sc, "external 
sorter") { - sc.parallelize(1 to 1000, 2).repartition(100).count() + // No spilling + AccumulatorSuite.verifyPeakExecutionMemorySet(sc, "external sorter without spilling") { + assertNotSpilled(sc, "verify peak memory") { + sc.parallelize(1 to spillThreshold / 2, 2).repartition(100).count() + } + } + // With spilling + AccumulatorSuite.verifyPeakExecutionMemorySet(sc, "external sorter with spilling") { + assertSpilled(sc, "verify peak memory") { + sc.parallelize(1 to spillThreshold * 3, 2).repartition(100).count() + } } } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/TestShuffleMemoryManager.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/TestShuffleMemoryManager.scala index 835f52fa566a2..c4358f409b6ef 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/TestShuffleMemoryManager.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/TestShuffleMemoryManager.scala @@ -68,6 +68,8 @@ private class GrantEverythingMemoryManager extends MemoryManager { blockId: BlockId, numBytes: Long, evictedBlocks: mutable.Buffer[(BlockId, BlockStatus)]): Boolean = true + override def releaseExecutionMemory(numBytes: Long): Unit = { } + override def releaseStorageMemory(numBytes: Long): Unit = { } override def maxExecutionMemory: Long = Long.MaxValue override def maxStorageMemory: Long = Long.MaxValue } From 6a2359ff1f7ad2233af2c530313d6ec2ecf70d19 Mon Sep 17 00:00:00 2001 From: Wenchen Fan Date: Thu, 15 Oct 2015 14:50:58 -0700 Subject: [PATCH 070/139] [SPARK-10412] [SQL] report memory usage for tungsten sql physical operator https://issues.apache.org/jira/browse/SPARK-10412 some screenshots: ### aggregate: ![screen shot 2015-10-12 at 2 23 11 pm](https://cloud.githubusercontent.com/assets/3182036/10439534/618320a4-70ef-11e5-94d8-62ea7f2d1531.png) ### join ![screen shot 2015-10-12 at 2 23 29 pm](https://cloud.githubusercontent.com/assets/3182036/10439537/6724797c-70ef-11e5-8f75-0cf5cbd42048.png) Author: Wenchen Fan Author: Wenchen Fan Closes #8931 from cloud-fan/viz. 
--- .../aggregate/TungstenAggregate.scala | 10 ++- .../TungstenAggregationIterator.scala | 10 ++- .../sql/execution/metric/SQLMetrics.scala | 72 +++++++++++++------ .../org/apache/spark/sql/execution/sort.scala | 16 +++++ .../sql/execution/ui/ExecutionPage.scala | 2 +- .../spark/sql/execution/ui/SQLListener.scala | 9 ++- .../sql/execution/ui/SparkPlanGraph.scala | 4 +- .../TungstenAggregationIteratorSuite.scala | 3 +- .../execution/metric/SQLMetricsSuite.scala | 13 +++- .../sql/execution/ui/SQLListenerSuite.scala | 20 +++--- 10 files changed, 116 insertions(+), 43 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/TungstenAggregate.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/TungstenAggregate.scala index c342940e6e757..0d3a4b36c161b 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/TungstenAggregate.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/TungstenAggregate.scala @@ -49,7 +49,9 @@ case class TungstenAggregate( override private[sql] lazy val metrics = Map( "numInputRows" -> SQLMetrics.createLongMetric(sparkContext, "number of input rows"), - "numOutputRows" -> SQLMetrics.createLongMetric(sparkContext, "number of output rows")) + "numOutputRows" -> SQLMetrics.createLongMetric(sparkContext, "number of output rows"), + "dataSize" -> SQLMetrics.createSizeMetric(sparkContext, "data size"), + "spillSize" -> SQLMetrics.createSizeMetric(sparkContext, "spill size")) override def outputsUnsafeRows: Boolean = true @@ -79,6 +81,8 @@ case class TungstenAggregate( protected override def doExecute(): RDD[InternalRow] = attachTree(this, "execute") { val numInputRows = longMetric("numInputRows") val numOutputRows = longMetric("numOutputRows") + val dataSize = longMetric("dataSize") + val spillSize = longMetric("spillSize") /** * Set up the underlying unsafe data structures used before computing the parent partition. @@ -97,7 +101,9 @@ case class TungstenAggregate( child.output, testFallbackStartsAt, numInputRows, - numOutputRows) + numOutputRows, + dataSize, + spillSize) } /** Compute a partition using the iterator already set up previously. */ diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/TungstenAggregationIterator.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/TungstenAggregationIterator.scala index fe708a5f71f79..7cd0f7b81e46c 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/TungstenAggregationIterator.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/TungstenAggregationIterator.scala @@ -87,7 +87,9 @@ class TungstenAggregationIterator( originalInputAttributes: Seq[Attribute], testFallbackStartsAt: Option[Int], numInputRows: LongSQLMetric, - numOutputRows: LongSQLMetric) + numOutputRows: LongSQLMetric, + dataSize: LongSQLMetric, + spillSize: LongSQLMetric) extends Iterator[UnsafeRow] with Logging { // The parent partition iterator, to be initialized later in `start` @@ -110,6 +112,10 @@ class TungstenAggregationIterator( s"$allAggregateExpressions should have no more than 2 kinds of modes.") } + // Remember spill data size of this task before execute this operator so that we can + // figure out how many bytes we spilled for this operator. + private val spillSizeBefore = TaskContext.get().taskMetrics().memoryBytesSpilled + // // The modes of AggregateExpressions. 
Right now, we can handle the following mode: // - Partial-only: @@ -842,6 +848,8 @@ class TungstenAggregationIterator( val mapMemory = hashMap.getPeakMemoryUsedBytes val sorterMemory = Option(externalSorter).map(_.getPeakMemoryUsedBytes).getOrElse(0L) val peakMemory = Math.max(mapMemory, sorterMemory) + dataSize += peakMemory + spillSize += TaskContext.get().taskMetrics().memoryBytesSpilled - spillSizeBefore TaskContext.get().internalMetricsToAccumulators( InternalAccumulator.PEAK_EXECUTION_MEMORY).add(peakMemory) } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/metric/SQLMetrics.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/metric/SQLMetrics.scala index 7a2a98ec18cb8..075b7ad881112 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/metric/SQLMetrics.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/metric/SQLMetrics.scala @@ -17,6 +17,7 @@ package org.apache.spark.sql.execution.metric +import org.apache.spark.util.Utils import org.apache.spark.{Accumulable, AccumulableParam, SparkContext} /** @@ -35,6 +36,12 @@ private[sql] abstract class SQLMetric[R <: SQLMetricValue[T], T]( */ private[sql] trait SQLMetricParam[R <: SQLMetricValue[T], T] extends AccumulableParam[R, T] { + /** + * A function that defines how we aggregate the final accumulator results among all tasks, + * and represent it in string for a SQL physical operator. + */ + val stringValue: Seq[T] => String + def zero: R } @@ -63,26 +70,12 @@ private[sql] class LongSQLMetricValue(private var _value : Long) extends SQLMetr override def value: Long = _value } -/** - * A wrapper of Int to avoid boxing and unboxing when using Accumulator - */ -private[sql] class IntSQLMetricValue(private var _value: Int) extends SQLMetricValue[Int] { - - def add(term: Int): IntSQLMetricValue = { - _value += term - this - } - - // Although there is a boxing here, it's fine because it's only called in SQLListener - override def value: Int = _value -} - /** * A specialized long Accumulable to avoid boxing and unboxing when using Accumulator's * `+=` and `add`. 
*/ -private[sql] class LongSQLMetric private[metric](name: String) - extends SQLMetric[LongSQLMetricValue, Long](name, LongSQLMetricParam) { +private[sql] class LongSQLMetric private[metric](name: String, param: LongSQLMetricParam) + extends SQLMetric[LongSQLMetricValue, Long](name, param) { override def +=(term: Long): Unit = { localValue.add(term) @@ -93,7 +86,8 @@ private[sql] class LongSQLMetric private[metric](name: String) } } -private object LongSQLMetricParam extends SQLMetricParam[LongSQLMetricValue, Long] { +private class LongSQLMetricParam(val stringValue: Seq[Long] => String, initialValue: Long) + extends SQLMetricParam[LongSQLMetricValue, Long] { override def addAccumulator(r: LongSQLMetricValue, t: Long): LongSQLMetricValue = r.add(t) @@ -102,20 +96,56 @@ private object LongSQLMetricParam extends SQLMetricParam[LongSQLMetricValue, Lon override def zero(initialValue: LongSQLMetricValue): LongSQLMetricValue = zero - override def zero: LongSQLMetricValue = new LongSQLMetricValue(0L) + override def zero: LongSQLMetricValue = new LongSQLMetricValue(initialValue) } private[sql] object SQLMetrics { - def createLongMetric(sc: SparkContext, name: String): LongSQLMetric = { - val acc = new LongSQLMetric(name) + private def createLongMetric( + sc: SparkContext, + name: String, + stringValue: Seq[Long] => String, + initialValue: Long): LongSQLMetric = { + val param = new LongSQLMetricParam(stringValue, initialValue) + val acc = new LongSQLMetric(name, param) sc.cleaner.foreach(_.registerAccumulatorForCleanup(acc)) acc } + def createLongMetric(sc: SparkContext, name: String): LongSQLMetric = { + createLongMetric(sc, name, _.sum.toString, 0L) + } + + /** + * Create a metric to report the size information (including total, min, med, max) like data size, + * spill size, etc. + */ + def createSizeMetric(sc: SparkContext, name: String): LongSQLMetric = { + val stringValue = (values: Seq[Long]) => { + // This is a workaround for SPARK-11013. + // We use -1 as initial value of the accumulator, if the accumulator is valid, we will update + // it at the end of task and the value will be at least 0. + val validValues = values.filter(_ >= 0) + val Seq(sum, min, med, max) = { + val metric = if (validValues.length == 0) { + Seq.fill(4)(0L) + } else { + val sorted = validValues.sorted + Seq(sorted.sum, sorted(0), sorted(validValues.length / 2), sorted(validValues.length - 1)) + } + metric.map(Utils.bytesToString) + } + s"\n$sum ($min, $med, $max)" + } + // The final result of this metric in physical operator UI may looks like: + // data size total (min, med, max): + // 100GB (100MB, 1GB, 10GB) + createLongMetric(sc, s"$name total (min, med, max)", stringValue, -1L) + } + /** * A metric that its value will be ignored. Use this one when we need a metric parameter but don't * care about the value. 
*/ - val nullLongMetric = new LongSQLMetric("null") + val nullLongMetric = new LongSQLMetric("null", new LongSQLMetricParam(_.sum.toString, 0L)) } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/sort.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/sort.scala index 27f26245a5ef0..9385e5734db5c 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/sort.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/sort.scala @@ -22,6 +22,7 @@ import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.errors._ import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans.physical.{Distribution, OrderedDistribution, UnspecifiedDistribution} +import org.apache.spark.sql.execution.metric.SQLMetrics import org.apache.spark.sql.types.StructType import org.apache.spark.util.CompletionIterator import org.apache.spark.util.collection.ExternalSorter @@ -93,10 +94,17 @@ case class TungstenSort( override def requiredChildDistribution: Seq[Distribution] = if (global) OrderedDistribution(sortOrder) :: Nil else UnspecifiedDistribution :: Nil + override private[sql] lazy val metrics = Map( + "dataSize" -> SQLMetrics.createSizeMetric(sparkContext, "data size"), + "spillSize" -> SQLMetrics.createSizeMetric(sparkContext, "spill size")) + protected override def doExecute(): RDD[InternalRow] = { val schema = child.schema val childOutput = child.output + val dataSize = longMetric("dataSize") + val spillSize = longMetric("spillSize") + /** * Set up the sorter in each partition before computing the parent partition. * This makes sure our sorter is not starved by other sorters used in the same task. @@ -131,7 +139,15 @@ case class TungstenSort( partitionIndex: Int, sorter: UnsafeExternalRowSorter, parentIterator: Iterator[InternalRow]): Iterator[InternalRow] = { + // Remember spill data size of this task before execute this operator so that we can + // figure out how many bytes we spilled for this operator. + val spillSizeBefore = TaskContext.get().taskMetrics().memoryBytesSpilled + val sortedIterator = sorter.sort(parentIterator.asInstanceOf[Iterator[UnsafeRow]]) + + dataSize += sorter.getPeakMemoryUsage + spillSize += TaskContext.get().taskMetrics().memoryBytesSpilled - spillSizeBefore + taskContext.internalMetricsToAccumulators( InternalAccumulator.PEAK_EXECUTION_MEMORY).add(sorter.getPeakMemoryUsage) sortedIterator diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/ExecutionPage.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/ExecutionPage.scala index a4dbd2e1978d0..e74d6fb396e1c 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/ExecutionPage.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/ExecutionPage.scala @@ -100,7 +100,7 @@ private[sql] class ExecutionPage(parent: SQLTab) extends WebUIPage("execution") // scalastyle:on } - private def planVisualization(metrics: Map[Long, Any], graph: SparkPlanGraph): Seq[Node] = { + private def planVisualization(metrics: Map[Long, String], graph: SparkPlanGraph): Seq[Node] = { val metadata = graph.nodes.flatMap { node => val nodeId = s"plan-meta-data-${node.id}"
<div id={nodeId}>{node.desc}</div>
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/SQLListener.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/SQLListener.scala index d6472400a6a21..b302b519998ac 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/SQLListener.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/SQLListener.scala @@ -252,7 +252,7 @@ private[sql] class SQLListener(conf: SparkConf) extends SparkListener with Loggi /** * Get all accumulator updates from all tasks which belong to this execution and merge them. */ - def getExecutionMetrics(executionId: Long): Map[Long, Any] = synchronized { + def getExecutionMetrics(executionId: Long): Map[Long, String] = synchronized { _executionIdToData.get(executionId) match { case Some(executionUIData) => val accumulatorUpdates = { @@ -264,8 +264,7 @@ private[sql] class SQLListener(conf: SparkConf) extends SparkListener with Loggi } }.filter { case (id, _) => executionUIData.accumulatorMetrics.contains(id) } mergeAccumulatorUpdates(accumulatorUpdates, accumulatorId => - executionUIData.accumulatorMetrics(accumulatorId).metricParam). - mapValues(_.asInstanceOf[SQLMetricValue[_]].value) + executionUIData.accumulatorMetrics(accumulatorId).metricParam) case None => // This execution has been dropped Map.empty @@ -274,11 +273,11 @@ private[sql] class SQLListener(conf: SparkConf) extends SparkListener with Loggi private def mergeAccumulatorUpdates( accumulatorUpdates: Seq[(Long, Any)], - paramFunc: Long => SQLMetricParam[SQLMetricValue[Any], Any]): Map[Long, Any] = { + paramFunc: Long => SQLMetricParam[SQLMetricValue[Any], Any]): Map[Long, String] = { accumulatorUpdates.groupBy(_._1).map { case (accumulatorId, values) => val param = paramFunc(accumulatorId) (accumulatorId, - values.map(_._2.asInstanceOf[SQLMetricValue[Any]]).foldLeft(param.zero)(param.addInPlace)) + param.stringValue(values.map(_._2.asInstanceOf[SQLMetricValue[Any]].value))) } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/SparkPlanGraph.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/SparkPlanGraph.scala index ae3d752dde348..f1fce5478a3fe 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/SparkPlanGraph.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/SparkPlanGraph.scala @@ -33,7 +33,7 @@ import org.apache.spark.sql.execution.metric.{SQLMetricParam, SQLMetricValue} private[ui] case class SparkPlanGraph( nodes: Seq[SparkPlanGraphNode], edges: Seq[SparkPlanGraphEdge]) { - def makeDotFile(metrics: Map[Long, Any]): String = { + def makeDotFile(metrics: Map[Long, String]): String = { val dotFile = new StringBuilder dotFile.append("digraph G {\n") nodes.foreach(node => dotFile.append(node.makeDotNode(metrics) + "\n")) @@ -87,7 +87,7 @@ private[sql] object SparkPlanGraph { private[ui] case class SparkPlanGraphNode( id: Long, name: String, desc: String, metrics: Seq[SQLPlanMetric]) { - def makeDotNode(metricsValue: Map[Long, Any]): String = { + def makeDotNode(metricsValue: Map[Long, String]): String = { val values = { for (metric <- metrics; value <- metricsValue.get(metric.accumulatorId)) yield { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/aggregate/TungstenAggregationIteratorSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/aggregate/TungstenAggregationIteratorSuite.scala index 0cc4988ff681c..cc0ac1b07c21a 100644 --- 
a/sql/core/src/test/scala/org/apache/spark/sql/execution/aggregate/TungstenAggregationIteratorSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/aggregate/TungstenAggregationIteratorSuite.scala @@ -39,7 +39,8 @@ class TungstenAggregationIteratorSuite extends SparkFunSuite with SharedSQLConte } val dummyAccum = SQLMetrics.createLongMetric(sparkContext, "dummy") iter = new TungstenAggregationIterator(Seq.empty, Seq.empty, Seq.empty, Seq.empty, Seq.empty, - 0, Seq.empty, newMutableProjection, Seq.empty, None, dummyAccum, dummyAccum) + 0, Seq.empty, newMutableProjection, Seq.empty, None, + dummyAccum, dummyAccum, dummyAccum, dummyAccum) val numPages = iter.getHashMap.getNumDataPages assert(numPages === 1) } finally { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLMetricsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLMetricsSuite.scala index 6afffae161ef6..cdd885ba14203 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLMetricsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/metric/SQLMetricsSuite.scala @@ -93,7 +93,16 @@ class SQLMetricsSuite extends SparkFunSuite with SharedSQLContext { }.toMap (node.id, node.name -> nodeMetrics) }.toMap - assert(expectedMetrics === actualMetrics) + + assert(expectedMetrics.keySet === actualMetrics.keySet) + for (nodeId <- expectedMetrics.keySet) { + val (expectedNodeName, expectedMetricsMap) = expectedMetrics(nodeId) + val (actualNodeName, actualMetricsMap) = actualMetrics(nodeId) + assert(expectedNodeName === actualNodeName) + for (metricName <- expectedMetricsMap.keySet) { + assert(expectedMetricsMap(metricName).toString === actualMetricsMap(metricName)) + } + } } else { // TODO Remove this "else" once we fix the race condition that missing the JobStarted event. // Since we cannot track all jobs, the metric values could be wrong and we should not check @@ -489,7 +498,7 @@ class SQLMetricsSuite extends SparkFunSuite with SharedSQLContext { val metricValues = sqlContext.listener.getExecutionMetrics(executionId) // Because "save" will create a new DataFrame internally, we cannot get the real metric id. // However, we still can check the value. 
- assert(metricValues.values.toSeq === Seq(2L)) + assert(metricValues.values.toSeq === Seq("2")) } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/ui/SQLListenerSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/ui/SQLListenerSuite.scala index 727cf3665a871..cc1c1e10e98c4 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/ui/SQLListenerSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/ui/SQLListenerSuite.scala @@ -74,6 +74,10 @@ class SQLListenerSuite extends SparkFunSuite with SharedSQLContext { } test("basic") { + def checkAnswer(actual: Map[Long, String], expected: Map[Long, Long]): Unit = { + assert(actual === expected.mapValues(_.toString)) + } + val listener = new SQLListener(sqlContext.sparkContext.conf) val executionId = 0 val df = createTestDataFrame @@ -114,7 +118,7 @@ class SQLListenerSuite extends SparkFunSuite with SharedSQLContext { (1L, 0, 0, createTaskMetrics(accumulatorUpdates)) ))) - assert(listener.getExecutionMetrics(0) === accumulatorUpdates.mapValues(_ * 2)) + checkAnswer(listener.getExecutionMetrics(0), accumulatorUpdates.mapValues(_ * 2)) listener.onExecutorMetricsUpdate(SparkListenerExecutorMetricsUpdate("", Seq( // (task id, stage id, stage attempt, metrics) @@ -122,7 +126,7 @@ class SQLListenerSuite extends SparkFunSuite with SharedSQLContext { (1L, 0, 0, createTaskMetrics(accumulatorUpdates.mapValues(_ * 2))) ))) - assert(listener.getExecutionMetrics(0) === accumulatorUpdates.mapValues(_ * 3)) + checkAnswer(listener.getExecutionMetrics(0), accumulatorUpdates.mapValues(_ * 3)) // Retrying a stage should reset the metrics listener.onStageSubmitted(SparkListenerStageSubmitted(createStageInfo(0, 1))) @@ -133,7 +137,7 @@ class SQLListenerSuite extends SparkFunSuite with SharedSQLContext { (1L, 0, 1, createTaskMetrics(accumulatorUpdates)) ))) - assert(listener.getExecutionMetrics(0) === accumulatorUpdates.mapValues(_ * 2)) + checkAnswer(listener.getExecutionMetrics(0), accumulatorUpdates.mapValues(_ * 2)) // Ignore the task end for the first attempt listener.onTaskEnd(SparkListenerTaskEnd( @@ -144,7 +148,7 @@ class SQLListenerSuite extends SparkFunSuite with SharedSQLContext { createTaskInfo(0, 0), createTaskMetrics(accumulatorUpdates.mapValues(_ * 100)))) - assert(listener.getExecutionMetrics(0) === accumulatorUpdates.mapValues(_ * 2)) + checkAnswer(listener.getExecutionMetrics(0), accumulatorUpdates.mapValues(_ * 2)) // Finish two tasks listener.onTaskEnd(SparkListenerTaskEnd( @@ -162,7 +166,7 @@ class SQLListenerSuite extends SparkFunSuite with SharedSQLContext { createTaskInfo(1, 0), createTaskMetrics(accumulatorUpdates.mapValues(_ * 3)))) - assert(listener.getExecutionMetrics(0) === accumulatorUpdates.mapValues(_ * 5)) + checkAnswer(listener.getExecutionMetrics(0), accumulatorUpdates.mapValues(_ * 5)) // Summit a new stage listener.onStageSubmitted(SparkListenerStageSubmitted(createStageInfo(1, 0))) @@ -173,7 +177,7 @@ class SQLListenerSuite extends SparkFunSuite with SharedSQLContext { (1L, 1, 0, createTaskMetrics(accumulatorUpdates)) ))) - assert(listener.getExecutionMetrics(0) === accumulatorUpdates.mapValues(_ * 7)) + checkAnswer(listener.getExecutionMetrics(0), accumulatorUpdates.mapValues(_ * 7)) // Finish two tasks listener.onTaskEnd(SparkListenerTaskEnd( @@ -191,7 +195,7 @@ class SQLListenerSuite extends SparkFunSuite with SharedSQLContext { createTaskInfo(1, 0), createTaskMetrics(accumulatorUpdates.mapValues(_ * 3)))) - assert(listener.getExecutionMetrics(0) === 
accumulatorUpdates.mapValues(_ * 11)) + checkAnswer(listener.getExecutionMetrics(0), accumulatorUpdates.mapValues(_ * 11)) assert(executionUIData.runningJobs === Seq(0)) assert(executionUIData.succeededJobs.isEmpty) @@ -208,7 +212,7 @@ class SQLListenerSuite extends SparkFunSuite with SharedSQLContext { assert(executionUIData.succeededJobs === Seq(0)) assert(executionUIData.failedJobs.isEmpty) - assert(listener.getExecutionMetrics(0) === accumulatorUpdates.mapValues(_ * 11)) + checkAnswer(listener.getExecutionMetrics(0), accumulatorUpdates.mapValues(_ * 11)) } test("onExecutionEnd happens before onJobEnd(JobSucceeded)") { From eb0b4d6e2ddfb765f082d0d88472626336ad2609 Mon Sep 17 00:00:00 2001 From: Josh Rosen Date: Thu, 15 Oct 2015 17:36:55 -0700 Subject: [PATCH 071/139] [SPARK-11135] [SQL] Exchange incorrectly skips sorts when existing ordering is non-empty subset of required ordering In Spark SQL, the Exchange planner tries to avoid unnecessary sorts in cases where the data has already been sorted by a superset of the requested sorting columns. For instance, let's say that a query calls for an operator's input to be sorted by `a.asc` and the input happens to already be sorted by `[a.asc, b.asc]`. In this case, we do not need to re-sort the input. The converse, however, is not true: if the query calls for `[a.asc, b.asc]`, then `a.asc` alone will not satisfy the ordering requirements, requiring an additional sort to be planned by Exchange. However, the current Exchange code gets this wrong and incorrectly skips sorting when the existing output ordering is a subset of the required ordering. This is simple to fix, however. This bug was introduced in https://github.com/apache/spark/pull/7458, so it affects 1.5.0+. This patch fixes the bug and significantly improves the unit test coverage of Exchange's sort-planning logic. Author: Josh Rosen Closes #9140 from JoshRosen/SPARK-11135. --- .../apache/spark/sql/execution/Exchange.scala | 5 +- .../spark/sql/execution/PlannerSuite.scala | 49 +++++++++++++++++++ 2 files changed, 52 insertions(+), 2 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/Exchange.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/Exchange.scala index 289453753f18d..1d3379a5e2d91 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/Exchange.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/Exchange.scala @@ -219,6 +219,8 @@ private[sql] case class EnsureRequirements(sqlContext: SQLContext) extends Rule[ val requiredChildDistributions: Seq[Distribution] = operator.requiredChildDistribution val requiredChildOrderings: Seq[Seq[SortOrder]] = operator.requiredChildOrdering var children: Seq[SparkPlan] = operator.children + assert(requiredChildDistributions.length == children.length) + assert(requiredChildOrderings.length == children.length) // Ensure that the operator's children satisfy their output distribution requirements: children = children.zip(requiredChildDistributions).map { case (child, distribution) => @@ -248,8 +250,7 @@ private[sql] case class EnsureRequirements(sqlContext: SQLContext) extends Rule[ children = children.zip(requiredChildOrderings).map { case (child, requiredOrdering) => if (requiredOrdering.nonEmpty) { // If child.outputOrdering is [a, b] and requiredOrdering is [a], we do not need to sort. 
- val minSize = Seq(requiredOrdering.size, child.outputOrdering.size).min - if (minSize == 0 || requiredOrdering.take(minSize) != child.outputOrdering.take(minSize)) { + if (requiredOrdering != child.outputOrdering.take(requiredOrdering.length)) { sqlContext.planner.BasicOperators.getSortOperator(requiredOrdering, global = false, child) } else { child diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/PlannerSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/PlannerSuite.scala index cafa1d5154788..ebdab1c26d7bd 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/PlannerSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/PlannerSuite.scala @@ -354,6 +354,55 @@ class PlannerSuite extends SharedSQLContext { } } + test("EnsureRequirements adds sort when there is no existing ordering") { + val orderingA = SortOrder(Literal(1), Ascending) + val orderingB = SortOrder(Literal(2), Ascending) + assert(orderingA != orderingB) + val inputPlan = DummySparkPlan( + children = DummySparkPlan(outputOrdering = Seq.empty) :: Nil, + requiredChildOrdering = Seq(Seq(orderingB)), + requiredChildDistribution = Seq(UnspecifiedDistribution) + ) + val outputPlan = EnsureRequirements(sqlContext).apply(inputPlan) + assertDistributionRequirementsAreSatisfied(outputPlan) + if (outputPlan.collect { case s: TungstenSort => true; case s: Sort => true }.isEmpty) { + fail(s"Sort should have been added:\n$outputPlan") + } + } + + test("EnsureRequirements skips sort when required ordering is prefix of existing ordering") { + val orderingA = SortOrder(Literal(1), Ascending) + val orderingB = SortOrder(Literal(2), Ascending) + assert(orderingA != orderingB) + val inputPlan = DummySparkPlan( + children = DummySparkPlan(outputOrdering = Seq(orderingA, orderingB)) :: Nil, + requiredChildOrdering = Seq(Seq(orderingA)), + requiredChildDistribution = Seq(UnspecifiedDistribution) + ) + val outputPlan = EnsureRequirements(sqlContext).apply(inputPlan) + assertDistributionRequirementsAreSatisfied(outputPlan) + if (outputPlan.collect { case s: TungstenSort => true; case s: Sort => true }.nonEmpty) { + fail(s"No sorts should have been added:\n$outputPlan") + } + } + + // This is a regression test for SPARK-11135 + test("EnsureRequirements adds sort when required ordering isn't a prefix of existing ordering") { + val orderingA = SortOrder(Literal(1), Ascending) + val orderingB = SortOrder(Literal(2), Ascending) + assert(orderingA != orderingB) + val inputPlan = DummySparkPlan( + children = DummySparkPlan(outputOrdering = Seq(orderingA)) :: Nil, + requiredChildOrdering = Seq(Seq(orderingA, orderingB)), + requiredChildDistribution = Seq(UnspecifiedDistribution) + ) + val outputPlan = EnsureRequirements(sqlContext).apply(inputPlan) + assertDistributionRequirementsAreSatisfied(outputPlan) + if (outputPlan.collect { case s: TungstenSort => true; case s: Sort => true }.isEmpty) { + fail(s"Sort should have been added:\n$outputPlan") + } + } + // --------------------------------------------------------------------------------------------- } From 43f5d1f326d7a2a4a78fe94853d0d05237568203 Mon Sep 17 00:00:00 2001 From: jerryshao Date: Fri, 16 Oct 2015 11:53:47 +0100 Subject: [PATCH 072/139] [SPARK-11060] [STREAMING] Fix some potential NPE in DStream transformation This patch fixes: 1. Guard out against NPEs in `TransformedDStream` when parent DStream returns None instead of empty RDD. 2. Verify some input streams which will potentially return None. 3. 
Add unit test to verify the behavior when input stream returns None. cc tdas , please help to review, thanks a lot :). Author: jerryshao Closes #9070 from jerryshao/SPARK-11060. --- .../dstream/ConstantInputDStream.scala | 6 +- .../streaming/dstream/QueueInputDStream.scala | 2 +- .../dstream/TransformedDStream.scala | 7 +- .../streaming/dstream/UnionDStream.scala | 11 ++-- .../streaming/BasicOperationsSuite.scala | 66 +++++++++++++++++++ 5 files changed, 83 insertions(+), 9 deletions(-) diff --git a/streaming/src/main/scala/org/apache/spark/streaming/dstream/ConstantInputDStream.scala b/streaming/src/main/scala/org/apache/spark/streaming/dstream/ConstantInputDStream.scala index f396c347581ce..4eb92dd8b1053 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/dstream/ConstantInputDStream.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/dstream/ConstantInputDStream.scala @@ -17,9 +17,10 @@ package org.apache.spark.streaming.dstream +import scala.reflect.ClassTag + import org.apache.spark.rdd.RDD import org.apache.spark.streaming.{Time, StreamingContext} -import scala.reflect.ClassTag /** * An input stream that always returns the same RDD on each timestep. Useful for testing. @@ -27,6 +28,9 @@ import scala.reflect.ClassTag class ConstantInputDStream[T: ClassTag](ssc_ : StreamingContext, rdd: RDD[T]) extends InputDStream[T](ssc_) { + require(rdd != null, + "parameter rdd null is illegal, which will lead to NPE in the following transformation") + override def start() {} override def stop() {} diff --git a/streaming/src/main/scala/org/apache/spark/streaming/dstream/QueueInputDStream.scala b/streaming/src/main/scala/org/apache/spark/streaming/dstream/QueueInputDStream.scala index a2685046e03d4..cd073646370d0 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/dstream/QueueInputDStream.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/dstream/QueueInputDStream.scala @@ -62,7 +62,7 @@ class QueueInputDStream[T: ClassTag]( } else if (defaultRDD != null) { Some(defaultRDD) } else { - None + Some(ssc.sparkContext.emptyRDD) } } diff --git a/streaming/src/main/scala/org/apache/spark/streaming/dstream/TransformedDStream.scala b/streaming/src/main/scala/org/apache/spark/streaming/dstream/TransformedDStream.scala index ab01f47d5cf99..5eabdf63dc8d7 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/dstream/TransformedDStream.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/dstream/TransformedDStream.scala @@ -20,7 +20,7 @@ package org.apache.spark.streaming.dstream import scala.reflect.ClassTag import org.apache.spark.SparkException -import org.apache.spark.rdd.{PairRDDFunctions, RDD} +import org.apache.spark.rdd.RDD import org.apache.spark.streaming.{Duration, Time} private[streaming] @@ -39,7 +39,10 @@ class TransformedDStream[U: ClassTag] ( override def slideDuration: Duration = parents.head.slideDuration override def compute(validTime: Time): Option[RDD[U]] = { - val parentRDDs = parents.map(_.getOrCompute(validTime).orNull).toSeq + val parentRDDs = parents.map { parent => parent.getOrCompute(validTime).getOrElse( + // Guard out against parent DStream that return None instead of Some(rdd) to avoid NPE + throw new SparkException(s"Couldn't generate RDD from parent at time $validTime")) + } val transformedRDD = transformFunc(parentRDDs, validTime) if (transformedRDD == null) { throw new SparkException("Transform function must not return null. 
" + diff --git a/streaming/src/main/scala/org/apache/spark/streaming/dstream/UnionDStream.scala b/streaming/src/main/scala/org/apache/spark/streaming/dstream/UnionDStream.scala index 9405dbaa12329..d73ffdfd84d2d 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/dstream/UnionDStream.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/dstream/UnionDStream.scala @@ -17,13 +17,14 @@ package org.apache.spark.streaming.dstream +import scala.collection.mutable.ArrayBuffer +import scala.reflect.ClassTag + +import org.apache.spark.SparkException import org.apache.spark.streaming.{Duration, Time} import org.apache.spark.rdd.RDD import org.apache.spark.rdd.UnionRDD -import scala.collection.mutable.ArrayBuffer -import scala.reflect.ClassTag - private[streaming] class UnionDStream[T: ClassTag](parents: Array[DStream[T]]) extends DStream[T](parents.head.ssc) { @@ -41,8 +42,8 @@ class UnionDStream[T: ClassTag](parents: Array[DStream[T]]) val rdds = new ArrayBuffer[RDD[T]]() parents.map(_.getOrCompute(validTime)).foreach { case Some(rdd) => rdds += rdd - case None => throw new Exception("Could not generate RDD from a parent for unifying at time " - + validTime) + case None => throw new SparkException("Could not generate RDD from a parent for unifying at" + + s" time $validTime") } if (rdds.size > 0) { Some(new UnionRDD(ssc.sc, rdds)) diff --git a/streaming/src/test/scala/org/apache/spark/streaming/BasicOperationsSuite.scala b/streaming/src/test/scala/org/apache/spark/streaming/BasicOperationsSuite.scala index 9988f410f0bc1..9d296c6d3ef8b 100644 --- a/streaming/src/test/scala/org/apache/spark/streaming/BasicOperationsSuite.scala +++ b/streaming/src/test/scala/org/apache/spark/streaming/BasicOperationsSuite.scala @@ -191,6 +191,20 @@ class BasicOperationsSuite extends TestSuiteBase { ) } + test("union with input stream return None") { + val input = Seq(1 to 4, 101 to 104, 201 to 204, null) + val output = Seq(1 to 8, 101 to 108, 201 to 208) + intercept[SparkException] { + testOperation( + input, + (s: DStream[Int]) => s.union(s.map(_ + 4)), + output, + input.length, + false + ) + } + } + test("StreamingContext.union") { val input = Seq(1 to 4, 101 to 104, 201 to 204) val output = Seq(1 to 12, 101 to 112, 201 to 212) @@ -224,6 +238,19 @@ class BasicOperationsSuite extends TestSuiteBase { } } + test("transform with input stream return None") { + val input = Seq(1 to 4, 5 to 8, null) + intercept[SparkException] { + testOperation( + input, + (r: DStream[Int]) => r.transform(rdd => rdd.map(_.toString)), + input.filterNot(_ == null).map(_.map(_.toString)), + input.length, + false + ) + } + } + test("transformWith") { val inputData1 = Seq( Seq("a", "b"), Seq("a", ""), Seq(""), Seq() ) val inputData2 = Seq( Seq("a", "b"), Seq("b", ""), Seq(), Seq("") ) @@ -244,6 +271,27 @@ class BasicOperationsSuite extends TestSuiteBase { testOperation(inputData1, inputData2, operation, outputData, true) } + test("transformWith with input stream return None") { + val inputData1 = Seq( Seq("a", "b"), Seq("a", ""), Seq(""), null ) + val inputData2 = Seq( Seq("a", "b"), Seq("b", ""), Seq(), null ) + val outputData = Seq( + Seq("a", "b", "a", "b"), + Seq("a", "b", "", ""), + Seq("") + ) + + val operation = (s1: DStream[String], s2: DStream[String]) => { + s1.transformWith( // RDD.join in transform + s2, + (rdd1: RDD[String], rdd2: RDD[String]) => rdd1.union(rdd2) + ) + } + + intercept[SparkException] { + testOperation(inputData1, inputData2, operation, outputData, inputData1.length, true) + } + } + 
test("StreamingContext.transform") { val input = Seq(1 to 4, 101 to 104, 201 to 204) val output = Seq(1 to 12, 101 to 112, 201 to 212) @@ -260,6 +308,24 @@ class BasicOperationsSuite extends TestSuiteBase { testOperation(input, operation, output) } + test("StreamingContext.transform with input stream return None") { + val input = Seq(1 to 4, 101 to 104, 201 to 204, null) + val output = Seq(1 to 12, 101 to 112, 201 to 212) + + // transform over 3 DStreams by doing union of the 3 RDDs + val operation = (s: DStream[Int]) => { + s.context.transform( + Seq(s, s.map(_ + 4), s.map(_ + 8)), // 3 DStreams + (rdds: Seq[RDD[_]], time: Time) => + rdds.head.context.union(rdds.map(_.asInstanceOf[RDD[Int]])) // union of RDDs + ) + } + + intercept[SparkException] { + testOperation(input, operation, output, input.length, false) + } + } + test("cogroup") { val inputData1 = Seq( Seq("a", "a", "b"), Seq("a", ""), Seq(""), Seq() ) val inputData2 = Seq( Seq("a", "a", "b"), Seq("b", ""), Seq(), Seq() ) From ed775042cceb61a0566502e1306ac3c70f4a6a5f Mon Sep 17 00:00:00 2001 From: Jakob Odersky Date: Fri, 16 Oct 2015 12:03:05 +0100 Subject: [PATCH 073/139] [SPARK-11092] [DOCS] Add source links to scaladoc generation Modify the SBT build script to include GitHub source links for generated Scaladocs, on releases only (no snapshots). Author: Jakob Odersky Closes #9110 from jodersky/unidoc. --- project/SparkBuild.scala | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/project/SparkBuild.scala b/project/SparkBuild.scala index 1339980c38800..8f0f310ddd24e 100644 --- a/project/SparkBuild.scala +++ b/project/SparkBuild.scala @@ -156,6 +156,10 @@ object SparkBuild extends PomBuild { javacOptions in Compile ++= Seq("-encoding", "UTF-8"), + scalacOptions in Compile ++= Seq( + "-sourcepath", (baseDirectory in ThisBuild).value.getAbsolutePath // Required for relative source links in scaladoc + ), + // Implements -Xfatal-warnings, ignoring deprecation warnings. // Code snippet taken from https://issues.scala-lang.org/browse/SI-8410. compile in Compile := { @@ -489,6 +493,8 @@ object Unidoc { .map(_.filterNot(_.getCanonicalPath.contains("org/apache/spark/sql/hive/test"))) } + val unidocSourceBase = settingKey[String]("Base URL of source links in Scaladoc.") + lazy val settings = scalaJavaUnidocSettings ++ Seq ( publish := {}, @@ -531,8 +537,19 @@ object Unidoc { "-noqualifier", "java.lang" ), - // Group similar methods together based on the @group annotation. - scalacOptions in (ScalaUnidoc, unidoc) ++= Seq("-groups") + // Use GitHub repository for Scaladoc source linke + unidocSourceBase := s"https://github.com/apache/spark/tree/v${version.value}", + + scalacOptions in (ScalaUnidoc, unidoc) ++= Seq( + "-groups" // Group similar methods together based on the @group annotation. + ) ++ ( + // Add links to sources when generating Scaladoc for a non-snapshot release + if (!isSnapshot.value) { + Opts.doc.sourceUrl(unidocSourceBase.value + "€{FILE_PATH}.scala") + } else { + Seq() + } + ) ) } From 08698ee1d6f29b2c999416f18a074d5193cdacd5 Mon Sep 17 00:00:00 2001 From: Jakob Odersky Date: Fri, 16 Oct 2015 14:26:34 +0100 Subject: [PATCH 074/139] [SPARK-11094] Strip extra strings from Java version in test runner Removes any extra strings from the Java version, fixing subsequent integer parsing. This is required since some OpenJDK versions (specifically in Debian testing), append an extra "-internal" string to the version field. Author: Jakob Odersky Closes #9111 from jodersky/fixtestrunner. 
--- dev/run-tests.py | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/dev/run-tests.py b/dev/run-tests.py index 1a816585187d9..d4d6880491bc8 100755 --- a/dev/run-tests.py +++ b/dev/run-tests.py @@ -176,17 +176,14 @@ def determine_java_version(java_exe): # find raw version string, eg 'java version "1.8.0_25"' raw_version_str = next(x for x in raw_output_lines if " version " in x) - version_str = raw_version_str.split()[-1].strip('"') # eg '1.8.0_25' - version, update = version_str.split('_') # eg ['1.8.0', '25'] + match = re.search('(\d+)\.(\d+)\.(\d+)_(\d+)', raw_version_str) - # map over the values and convert them to integers - version_info = [int(x) for x in version.split('.') + [update]] - - return JavaVersion(major=version_info[0], - minor=version_info[1], - patch=version_info[2], - update=version_info[3]) + major = int(match.group(1)) + minor = int(match.group(2)) + patch = int(match.group(3)) + update = int(match.group(4)) + return JavaVersion(major, minor, patch, update) # ------------------------------------------------------------------------------------------------- # Functions for running the other build and test scripts From 4ee2cea2a43f7d04ab8511d9c029f80c5dadd48e Mon Sep 17 00:00:00 2001 From: Jakob Odersky Date: Fri, 16 Oct 2015 17:24:18 +0100 Subject: [PATCH 075/139] [SPARK-11122] [BUILD] [WARN] Add tag to fatal warnings Shows that an error is actually due to a fatal warning. Author: Jakob Odersky Closes #9128 from jodersky/fatalwarnings. --- project/SparkBuild.scala | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/project/SparkBuild.scala b/project/SparkBuild.scala index 8f0f310ddd24e..766edd9500c30 100644 --- a/project/SparkBuild.scala +++ b/project/SparkBuild.scala @@ -164,7 +164,7 @@ object SparkBuild extends PomBuild { // Code snippet taken from https://issues.scala-lang.org/browse/SI-8410. compile in Compile := { val analysis = (compile in Compile).value - val s = streams.value + val out = streams.value def logProblem(l: (=> String) => Unit, f: File, p: xsbti.Problem) = { l(f.toString + ":" + p.position.line.fold("")(_ + ":") + " " + p.message) @@ -181,7 +181,14 @@ object SparkBuild extends PomBuild { failed = failed + 1 } - logProblem(if (deprecation) s.log.warn else s.log.error, k, p) + val printer: (=> String) => Unit = s => if (deprecation) { + out.log.warn(s) + } else { + out.log.error("[warn] " + s) + } + + logProblem(printer, k, p) + } } From b9c5e5d4ac4c9fe29e880f4ee562a9c552e81d29 Mon Sep 17 00:00:00 2001 From: "navis.ryu" Date: Fri, 16 Oct 2015 11:19:37 -0700 Subject: [PATCH 076/139] [SPARK-11124] JsonParser/Generator should be closed for resource recycle Some json parsers are not closed. parser in JacksonParser#parseJson, for example. Author: navis.ryu Closes #9130 from navis/SPARK-11124. 
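The cleanup is centralized in a small loan-pattern helper (`Utils.tryWithResource`) that closes the parser or generator even when the body throws. A self-contained sketch of the same pattern, outside Spark:

```scala
import java.io.Closeable

object TryWithResourceSketch {
  // Create the resource, hand it to f, and always close it, even if f throws,
  // so JSON parsers/generators are never leaked.
  def tryWithResource[R <: Closeable, T](createResource: => R)(f: R => T): T = {
    val resource = createResource
    try f(resource) finally resource.close()
  }

  def main(args: Array[String]): Unit = {
    val text = tryWithResource(new java.io.StringReader("hello")) { reader =>
      val buf = new Array[Char](5)
      val n = reader.read(buf)
      new String(buf, 0, n)
    }
    println(text) // prints "hello"
  }
}
```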
--- .../scala/org/apache/spark/util/Utils.scala | 4 ++ .../expressions/jsonExpressions.scala | 56 +++++++++---------- .../datasources/json/InferSchema.scala | 8 ++- .../datasources/json/JacksonParser.scala | 41 +++++++------- 4 files changed, 57 insertions(+), 52 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/util/Utils.scala b/core/src/main/scala/org/apache/spark/util/Utils.scala index bd7e51c3b5100..22c05a2479422 100644 --- a/core/src/main/scala/org/apache/spark/util/Utils.scala +++ b/core/src/main/scala/org/apache/spark/util/Utils.scala @@ -2153,6 +2153,10 @@ private[spark] object Utils extends Logging { conf.getInt("spark.executor.instances", 0) == 0 } + def tryWithResource[R <: Closeable, T](createResource: => R)(f: R => T): T = { + val resource = createResource + try f.apply(resource) finally resource.close() + } } /** diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala index 0770fab0ae901..8c9853e628d2c 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala @@ -25,6 +25,7 @@ import org.apache.spark.sql.catalyst.analysis.TypeCheckResult import org.apache.spark.sql.catalyst.expressions.codegen.CodegenFallback import org.apache.spark.sql.types.{StructField, StructType, StringType, DataType} import org.apache.spark.unsafe.types.UTF8String +import org.apache.spark.util.Utils import scala.util.parsing.combinator.RegexParsers @@ -134,16 +135,18 @@ case class GetJsonObject(json: Expression, path: Expression) if (parsed.isDefined) { try { - val parser = jsonFactory.createParser(jsonStr.getBytes) - val output = new ByteArrayOutputStream() - val generator = jsonFactory.createGenerator(output, JsonEncoding.UTF8) - parser.nextToken() - val matched = evaluatePath(parser, generator, RawStyle, parsed.get) - generator.close() - if (matched) { - UTF8String.fromBytes(output.toByteArray) - } else { - null + Utils.tryWithResource(jsonFactory.createParser(jsonStr.getBytes)) { parser => + val output = new ByteArrayOutputStream() + val matched = Utils.tryWithResource( + jsonFactory.createGenerator(output, JsonEncoding.UTF8)) { generator => + parser.nextToken() + evaluatePath(parser, generator, RawStyle, parsed.get) + } + if (matched) { + UTF8String.fromBytes(output.toByteArray) + } else { + null + } } } catch { case _: JsonProcessingException => null @@ -250,17 +253,18 @@ case class GetJsonObject(json: Expression, path: Expression) // temporarily buffer child matches, the emitted json will need to be // modified slightly if there is only a single element written val buffer = new StringWriter() - val flattenGenerator = jsonFactory.createGenerator(buffer) - flattenGenerator.writeStartArray() var dirty = 0 - while (p.nextToken() != END_ARRAY) { - // track the number of array elements and only emit an outer array if - // we've written more than one element, this matches Hive's behavior - dirty += (if (evaluatePath(p, flattenGenerator, nextStyle, xs)) 1 else 0) + Utils.tryWithResource(jsonFactory.createGenerator(buffer)) { flattenGenerator => + flattenGenerator.writeStartArray() + + while (p.nextToken() != END_ARRAY) { + // track the number of array elements and only emit an outer array if + // we've written more than one element, this matches Hive's behavior + dirty += (if (evaluatePath(p, 
flattenGenerator, nextStyle, xs)) 1 else 0) + } + flattenGenerator.writeEndArray() } - flattenGenerator.writeEndArray() - flattenGenerator.close() val buf = buffer.getBuffer if (dirty > 1) { @@ -370,12 +374,8 @@ case class JsonTuple(children: Seq[Expression]) } try { - val parser = jsonFactory.createParser(json.getBytes) - - try { - parseRow(parser, input) - } finally { - parser.close() + Utils.tryWithResource(jsonFactory.createParser(json.getBytes)) { + parser => parseRow(parser, input) } } catch { case _: JsonProcessingException => @@ -420,12 +420,8 @@ case class JsonTuple(children: Seq[Expression]) // write the output directly to UTF8 encoded byte array if (parser.nextToken() != JsonToken.VALUE_NULL) { - val generator = jsonFactory.createGenerator(output, JsonEncoding.UTF8) - - try { - copyCurrentStructure(generator, parser) - } finally { - generator.close() + Utils.tryWithResource(jsonFactory.createGenerator(output, JsonEncoding.UTF8)) { + generator => copyCurrentStructure(generator, parser) } row(idx) = UTF8String.fromBytes(output.toByteArray) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/InferSchema.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/InferSchema.scala index b6f3410bad690..d0780028dacb1 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/InferSchema.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/InferSchema.scala @@ -23,6 +23,7 @@ import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.analysis.HiveTypeCoercion import org.apache.spark.sql.execution.datasources.json.JacksonUtils.nextUntil import org.apache.spark.sql.types._ +import org.apache.spark.util.Utils private[sql] object InferSchema { /** @@ -47,9 +48,10 @@ private[sql] object InferSchema { val factory = new JsonFactory() iter.map { row => try { - val parser = factory.createParser(row) - parser.nextToken() - inferField(parser) + Utils.tryWithResource(factory.createParser(row)) { parser => + parser.nextToken() + inferField(parser) + } } catch { case _: JsonParseException => StructType(Seq(StructField(columnNameOfCorruptRecords, StringType))) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JacksonParser.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JacksonParser.scala index c51140749c8e6..09b8a9e936a1d 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JacksonParser.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JacksonParser.scala @@ -30,6 +30,7 @@ import org.apache.spark.sql.catalyst.util.DateTimeUtils import org.apache.spark.sql.execution.datasources.json.JacksonUtils.nextUntil import org.apache.spark.sql.types._ import org.apache.spark.unsafe.types.UTF8String +import org.apache.spark.util.Utils private[sql] object JacksonParser { def apply( @@ -86,9 +87,9 @@ private[sql] object JacksonParser { case (_, StringType) => val writer = new ByteArrayOutputStream() - val generator = factory.createGenerator(writer, JsonEncoding.UTF8) - generator.copyCurrentStructure(parser) - generator.close() + Utils.tryWithResource(factory.createGenerator(writer, JsonEncoding.UTF8)) { + generator => generator.copyCurrentStructure(parser) + } UTF8String.fromBytes(writer.toByteArray) case (VALUE_NUMBER_INT | VALUE_NUMBER_FLOAT, FloatType) => @@ -245,22 +246,24 @@ private[sql] object JacksonParser { iter.flatMap { record => try { - val parser = 
factory.createParser(record) - parser.nextToken() - - convertField(factory, parser, schema) match { - case null => failedRecord(record) - case row: InternalRow => row :: Nil - case array: ArrayData => - if (array.numElements() == 0) { - Nil - } else { - array.toArray[InternalRow](schema) - } - case _ => - sys.error( - s"Failed to parse record $record. Please make sure that each line of the file " + - "(or each string in the RDD) is a valid JSON object or an array of JSON objects.") + Utils.tryWithResource(factory.createParser(record)) { parser => + parser.nextToken() + + convertField(factory, parser, schema) match { + case null => failedRecord(record) + case row: InternalRow => row :: Nil + case array: ArrayData => + if (array.numElements() == 0) { + Nil + } else { + array.toArray[InternalRow](schema) + } + case _ => + sys.error( + s"Failed to parse record $record. Please make sure that each line of the file " + + "(or each string in the RDD) is a valid JSON object or " + + "an array of JSON objects.") + } } } catch { case _: JsonProcessingException => From 3d683a139b333456a6bd8801ac5f113d1ac3fd18 Mon Sep 17 00:00:00 2001 From: Pravin Gadakh Date: Fri, 16 Oct 2015 13:38:50 -0700 Subject: [PATCH 077/139] [SPARK-10581] [DOCS] Groups are not resolved in scaladoc in sql classes Groups are not resolved properly in scaladoc in following classes: sql/core/src/main/scala/org/apache/spark/sql/Column.scala sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala sql/core/src/main/scala/org/apache/spark/sql/functions.scala Author: Pravin Gadakh Closes #9148 from pravingadakh/master. --- sql/core/src/main/scala/org/apache/spark/sql/Column.scala | 8 ++++---- .../src/main/scala/org/apache/spark/sql/SQLContext.scala | 2 +- .../src/main/scala/org/apache/spark/sql/functions.scala | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Column.scala b/sql/core/src/main/scala/org/apache/spark/sql/Column.scala index 807bc8c30c12d..1f826887ac774 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/Column.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/Column.scala @@ -41,10 +41,10 @@ private[sql] object Column { * :: Experimental :: * A column in a [[DataFrame]]. * - * @groupname java_expr_ops Java-specific expression operators. - * @groupname expr_ops Expression operators. - * @groupname df_ops DataFrame functions. - * @groupname Ungrouped Support functions for DataFrames. + * @groupname java_expr_ops Java-specific expression operators + * @groupname expr_ops Expression operators + * @groupname df_ops DataFrame functions + * @groupname Ungrouped Support functions for DataFrames * * @since 1.3.0 */ diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala index 361eb576c567a..e83657a60558d 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala @@ -59,7 +59,7 @@ import org.apache.spark.util.Utils * @groupname specificdata Specific Data Sources * @groupname config Configuration * @groupname dataframes Custom DataFrame Creation - * @groupname Ungrouped Support functions for language integrated queries. 
+ * @groupname Ungrouped Support functions for language integrated queries * * @since 1.0.0 */ diff --git a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala index 2467b4e48415b..15c864a8ab641 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala @@ -43,7 +43,7 @@ import org.apache.spark.util.Utils * @groupname window_funcs Window functions * @groupname string_funcs String functions * @groupname collection_funcs Collection functions - * @groupname Ungrouped Support functions for DataFrames. + * @groupname Ungrouped Support functions for DataFrames * @since 1.3.0 */ @Experimental From 369d786f58580e7df73e7e23f27390d37269d0de Mon Sep 17 00:00:00 2001 From: zsxwing Date: Fri, 16 Oct 2015 13:53:06 -0700 Subject: [PATCH 078/139] [SPARK-10974] [STREAMING] Add progress bar for output operation column and use red dots for failed batches Screenshot: 1 Also fixed the description and duration for output operations that don't have spark jobs. 2 Author: zsxwing Closes #9010 from zsxwing/output-op-progress-bar. --- .../streaming/ui/static/streaming-page.js | 26 ++- .../apache/spark/streaming/DStreamGraph.scala | 2 +- .../spark/streaming/scheduler/BatchInfo.scala | 23 +-- .../spark/streaming/scheduler/Job.scala | 30 ++- .../streaming/scheduler/JobScheduler.scala | 12 +- .../spark/streaming/scheduler/JobSet.scala | 17 +- .../scheduler/OutputOperationInfo.scala | 6 +- .../spark/streaming/ui/AllBatchesTable.scala | 40 ++-- .../apache/spark/streaming/ui/BatchPage.scala | 174 +++++++----------- .../spark/streaming/ui/BatchUIData.scala | 67 ++++++- .../ui/StreamingJobProgressListener.scala | 14 ++ .../streaming/StreamingListenerSuite.scala | 16 +- .../spark/streaming/UISeleniumSuite.scala | 2 +- .../StreamingJobProgressListenerSuite.scala | 30 +-- 14 files changed, 258 insertions(+), 201 deletions(-) diff --git a/streaming/src/main/resources/org/apache/spark/streaming/ui/static/streaming-page.js b/streaming/src/main/resources/org/apache/spark/streaming/ui/static/streaming-page.js index 4886b68eeaf76..f82323a1cdd94 100644 --- a/streaming/src/main/resources/org/apache/spark/streaming/ui/static/streaming-page.js +++ b/streaming/src/main/resources/org/apache/spark/streaming/ui/static/streaming-page.js @@ -154,34 +154,40 @@ function drawTimeline(id, data, minX, maxX, minY, maxY, unitY, batchInterval) { var lastClickedBatch = null; var lastTimeout = null; + function isFailedBatch(batchTime) { + return $("#batch-" + batchTime).attr("isFailed") == "true"; + } + // Add points to the line. However, we make it invisible at first. But when the user moves mouse // over a point, it will be displayed with its detail. svg.selectAll(".point") .data(data) .enter().append("circle") - .attr("stroke", "white") // white and opacity = 0 make it invisible - .attr("fill", "white") - .attr("opacity", "0") + .attr("stroke", function(d) { return isFailedBatch(d.x) ? "red" : "white";}) // white and opacity = 0 make it invisible + .attr("fill", function(d) { return isFailedBatch(d.x) ? "red" : "white";}) + .attr("opacity", function(d) { return isFailedBatch(d.x) ? "1" : "0";}) .style("cursor", "pointer") .attr("cx", function(d) { return x(d.x); }) .attr("cy", function(d) { return y(d.y); }) - .attr("r", function(d) { return 3; }) + .attr("r", function(d) { return isFailedBatch(d.x) ? 
"2" : "0";}) .on('mouseover', function(d) { var tip = formatYValue(d.y) + " " + unitY + " at " + timeFormat[d.x]; showBootstrapTooltip(d3.select(this).node(), tip); // show the point d3.select(this) - .attr("stroke", "steelblue") - .attr("fill", "steelblue") - .attr("opacity", "1"); + .attr("stroke", function(d) { return isFailedBatch(d.x) ? "red" : "steelblue";}) + .attr("fill", function(d) { return isFailedBatch(d.x) ? "red" : "steelblue";}) + .attr("opacity", "1") + .attr("r", "3"); }) .on('mouseout', function() { hideBootstrapTooltip(d3.select(this).node()); // hide the point d3.select(this) - .attr("stroke", "white") - .attr("fill", "white") - .attr("opacity", "0"); + .attr("stroke", function(d) { return isFailedBatch(d.x) ? "red" : "white";}) + .attr("fill", function(d) { return isFailedBatch(d.x) ? "red" : "white";}) + .attr("opacity", function(d) { return isFailedBatch(d.x) ? "1" : "0";}) + .attr("r", function(d) { return isFailedBatch(d.x) ? "2" : "0";}); }) .on("click", function(d) { if (lastTimeout != null) { diff --git a/streaming/src/main/scala/org/apache/spark/streaming/DStreamGraph.scala b/streaming/src/main/scala/org/apache/spark/streaming/DStreamGraph.scala index de79c9ef1abfa..1b0b7890b3b00 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/DStreamGraph.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/DStreamGraph.scala @@ -113,7 +113,7 @@ final private[streaming] class DStreamGraph extends Serializable with Logging { val jobs = this.synchronized { outputStreams.flatMap { outputStream => val jobOption = outputStream.generateJob(time) - jobOption.foreach(_.setCallSite(outputStream.creationSite.longForm)) + jobOption.foreach(_.setCallSite(outputStream.creationSite)) jobOption } } diff --git a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/BatchInfo.scala b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/BatchInfo.scala index 463f899dc249b..436eb0a566141 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/BatchInfo.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/BatchInfo.scala @@ -29,6 +29,7 @@ import org.apache.spark.streaming.Time * the streaming scheduler queue * @param processingStartTime Clock time of when the first job of this batch started processing * @param processingEndTime Clock time of when the last job of this batch finished processing + * @param outputOperationInfos The output operations in this batch */ @DeveloperApi case class BatchInfo( @@ -36,13 +37,10 @@ case class BatchInfo( streamIdToInputInfo: Map[Int, StreamInputInfo], submissionTime: Long, processingStartTime: Option[Long], - processingEndTime: Option[Long] + processingEndTime: Option[Long], + outputOperationInfos: Map[Int, OutputOperationInfo] ) { - private var _failureReasons: Map[Int, String] = Map.empty - - private var _numOutputOp: Int = 0 - @deprecated("Use streamIdToInputInfo instead", "1.5.0") def streamIdToNumRecords: Map[Int, Long] = streamIdToInputInfo.mapValues(_.numRecords) @@ -72,19 +70,4 @@ case class BatchInfo( */ def numRecords: Long = streamIdToInputInfo.values.map(_.numRecords).sum - /** Set the failure reasons corresponding to every output ops in the batch */ - private[streaming] def setFailureReason(reasons: Map[Int, String]): Unit = { - _failureReasons = reasons - } - - /** Failure reasons corresponding to every output ops in the batch */ - private[streaming] def failureReasons = _failureReasons - - /** Set the number of output operations in this batch */ - 
private[streaming] def setNumOutputOp(numOutputOp: Int): Unit = { - _numOutputOp = numOutputOp - } - - /** Return the number of output operations in this batch */ - private[streaming] def numOutputOp: Int = _numOutputOp } diff --git a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/Job.scala b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/Job.scala index 1373053f064f3..ab1b3565fcc19 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/Job.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/Job.scala @@ -17,8 +17,10 @@ package org.apache.spark.streaming.scheduler +import scala.util.{Failure, Try} + import org.apache.spark.streaming.Time -import scala.util.Try +import org.apache.spark.util.{Utils, CallSite} /** * Class representing a Spark computation. It may contain multiple Spark jobs. @@ -29,7 +31,9 @@ class Job(val time: Time, func: () => _) { private var _outputOpId: Int = _ private var isSet = false private var _result: Try[_] = null - private var _callSite: String = "Unknown" + private var _callSite: CallSite = null + private var _startTime: Option[Long] = None + private var _endTime: Option[Long] = None def run() { _result = Try(func()) @@ -71,11 +75,29 @@ class Job(val time: Time, func: () => _) { _outputOpId = outputOpId } - def setCallSite(callSite: String): Unit = { + def setCallSite(callSite: CallSite): Unit = { _callSite = callSite } - def callSite: String = _callSite + def callSite: CallSite = _callSite + + def setStartTime(startTime: Long): Unit = { + _startTime = Some(startTime) + } + + def setEndTime(endTime: Long): Unit = { + _endTime = Some(endTime) + } + + def toOutputOperationInfo: OutputOperationInfo = { + val failureReason = if (_result != null && _result.isFailure) { + Some(Utils.exceptionString(_result.asInstanceOf[Failure[_]].exception)) + } else { + None + } + OutputOperationInfo( + time, outputOpId, callSite.shortForm, callSite.longForm, _startTime, _endTime, failureReason) + } override def toString: String = id } diff --git a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/JobScheduler.scala b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/JobScheduler.scala index 0a4a396a0f498..2480b4ec093e2 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/JobScheduler.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/JobScheduler.scala @@ -20,13 +20,13 @@ package org.apache.spark.streaming.scheduler import java.util.concurrent.{ConcurrentHashMap, TimeUnit} import scala.collection.JavaConverters._ -import scala.util.{Failure, Success} +import scala.util.Failure import org.apache.spark.Logging import org.apache.spark.rdd.PairRDDFunctions import org.apache.spark.streaming._ import org.apache.spark.streaming.ui.UIUtils -import org.apache.spark.util.{EventLoop, ThreadUtils} +import org.apache.spark.util.{EventLoop, ThreadUtils, Utils} private[scheduler] sealed trait JobSchedulerEvent @@ -162,16 +162,16 @@ class JobScheduler(val ssc: StreamingContext) extends Logging { // correct "jobSet.processingStartTime". 
listenerBus.post(StreamingListenerBatchStarted(jobSet.toBatchInfo)) } - listenerBus.post(StreamingListenerOutputOperationStarted( - OutputOperationInfo(job.time, job.outputOpId, job.callSite, Some(startTime), None))) + job.setStartTime(startTime) + listenerBus.post(StreamingListenerOutputOperationStarted(job.toOutputOperationInfo)) logInfo("Starting job " + job.id + " from job set of time " + jobSet.time) } private def handleJobCompletion(job: Job, completedTime: Long) { val jobSet = jobSets.get(job.time) jobSet.handleJobCompletion(job) - listenerBus.post(StreamingListenerOutputOperationCompleted( - OutputOperationInfo(job.time, job.outputOpId, job.callSite, None, Some(completedTime)))) + job.setEndTime(completedTime) + listenerBus.post(StreamingListenerOutputOperationCompleted(job.toOutputOperationInfo)) logInfo("Finished job " + job.id + " from job set of time " + jobSet.time) if (jobSet.hasCompleted) { jobSets.remove(jobSet.time) diff --git a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/JobSet.scala b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/JobSet.scala index 08f63cc99268f..f76300351e3c0 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/JobSet.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/JobSet.scala @@ -64,24 +64,13 @@ case class JobSet( } def toBatchInfo: BatchInfo = { - val failureReasons: Map[Int, String] = { - if (hasCompleted) { - jobs.filter(_.result.isFailure).map { job => - (job.outputOpId, Utils.exceptionString(job.result.asInstanceOf[Failure[_]].exception)) - }.toMap - } else { - Map.empty - } - } - val binfo = new BatchInfo( + BatchInfo( time, streamIdToInputInfo, submissionTime, if (processingStartTime >= 0) Some(processingStartTime) else None, - if (processingEndTime >= 0) Some(processingEndTime) else None + if (processingEndTime >= 0) Some(processingEndTime) else None, + jobs.map { job => (job.outputOpId, job.toOutputOperationInfo) }.toMap ) - binfo.setFailureReason(failureReasons) - binfo.setNumOutputOp(jobs.size) - binfo } } diff --git a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/OutputOperationInfo.scala b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/OutputOperationInfo.scala index d5614b343912b..137e512a670da 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/OutputOperationInfo.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/OutputOperationInfo.scala @@ -25,17 +25,21 @@ import org.apache.spark.streaming.Time * Class having information on output operations. * @param batchTime Time of the batch * @param id Id of this output operation. Different output operations have different ids in a batch. + * @param name The name of this output operation. * @param description The description of this output operation. * @param startTime Clock time of when the output operation started processing * @param endTime Clock time of when the output operation started processing + * @param failureReason Failure reason if this output operation fails */ @DeveloperApi case class OutputOperationInfo( batchTime: Time, id: Int, + name: String, description: String, startTime: Option[Long], - endTime: Option[Long]) { + endTime: Option[Long], + failureReason: Option[String]) { /** * Return the duration of this output operation. 
diff --git a/streaming/src/main/scala/org/apache/spark/streaming/ui/AllBatchesTable.scala b/streaming/src/main/scala/org/apache/spark/streaming/ui/AllBatchesTable.scala index 3e6590d66f587..125cafd41b8af 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/ui/AllBatchesTable.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/ui/AllBatchesTable.scala @@ -17,9 +17,6 @@ package org.apache.spark.streaming.ui -import java.text.SimpleDateFormat -import java.util.Date - import scala.xml.Node import org.apache.spark.ui.{UIUtils => SparkUIUtils} @@ -46,7 +43,8 @@ private[ui] abstract class BatchTableBase(tableId: String, batchInterval: Long) val formattedProcessingTime = processingTime.map(SparkUIUtils.formatDuration).getOrElse("-") val batchTimeId = s"batch-$batchTime" - + {formattedBatchTime} @@ -75,6 +73,19 @@ private[ui] abstract class BatchTableBase(tableId: String, batchInterval: Long) batchTable } + protected def createOutputOperationProgressBar(batch: BatchUIData): Seq[Node] = { + + { + SparkUIUtils.makeProgressBar( + started = batch.numActiveOutputOp, + completed = batch.numCompletedOutputOp, + failed = batch.numFailedOutputOp, + skipped = 0, + total = batch.outputOperations.size) + } + + } + /** * Return HTML for all rows of this table. */ @@ -86,7 +97,10 @@ private[ui] class ActiveBatchTable( waitingBatches: Seq[BatchUIData], batchInterval: Long) extends BatchTableBase("active-batches-table", batchInterval) { - override protected def columns: Seq[Node] = super.columns ++ Status + override protected def columns: Seq[Node] = super.columns ++ { + Output Ops: Succeeded/Total + Status + } override protected def renderRows: Seq[Node] = { // The "batchTime"s of "waitingBatches" must be greater than "runningBatches"'s, so display @@ -96,11 +110,11 @@ private[ui] class ActiveBatchTable( } private def runningBatchRow(batch: BatchUIData): Seq[Node] = { - baseRow(batch) ++ processing + baseRow(batch) ++ createOutputOperationProgressBar(batch) ++ processing } private def waitingBatchRow(batch: BatchUIData): Seq[Node] = { - baseRow(batch) ++ queued + baseRow(batch) ++ createOutputOperationProgressBar(batch) ++ queued } } @@ -119,17 +133,11 @@ private[ui] class CompletedBatchTable(batches: Seq[BatchUIData], batchInterval: private def completedBatchRow(batch: BatchUIData): Seq[Node] = { val totalDelay = batch.totalDelay val formattedTotalDelay = totalDelay.map(SparkUIUtils.formatDuration).getOrElse("-") - val numFailedOutputOp = batch.failureReason.size - val outputOpColumn = if (numFailedOutputOp > 0) { - s"${batch.numOutputOp - numFailedOutputOp}/${batch.numOutputOp}" + - s" (${numFailedOutputOp} failed)" - } else { - s"${batch.numOutputOp}/${batch.numOutputOp}" - } - baseRow(batch) ++ + + baseRow(batch) ++ { {formattedTotalDelay} - {outputOpColumn} + } ++ createOutputOperationProgressBar(batch) } } diff --git a/streaming/src/main/scala/org/apache/spark/streaming/ui/BatchPage.scala b/streaming/src/main/scala/org/apache/spark/streaming/ui/BatchPage.scala index a19b85a51d289..2ed925572826e 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/ui/BatchPage.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/ui/BatchPage.scala @@ -47,32 +47,30 @@ private[ui] class BatchPage(parent: StreamingTab) extends WebUIPage("batch") { } private def generateJobRow( - outputOpId: OutputOpId, + outputOpData: OutputOperationUIData, outputOpDescription: Seq[Node], formattedOutputOpDuration: String, - outputOpStatus: String, numSparkJobRowsInOutputOp: Int, isFirstRow: 
Boolean, sparkJob: SparkJobIdWithUIData): Seq[Node] = { if (sparkJob.jobUIData.isDefined) { - generateNormalJobRow(outputOpId, outputOpDescription, formattedOutputOpDuration, - outputOpStatus, numSparkJobRowsInOutputOp, isFirstRow, sparkJob.jobUIData.get) + generateNormalJobRow(outputOpData, outputOpDescription, formattedOutputOpDuration, + numSparkJobRowsInOutputOp, isFirstRow, sparkJob.jobUIData.get) } else { - generateDroppedJobRow(outputOpId, outputOpDescription, formattedOutputOpDuration, - outputOpStatus, numSparkJobRowsInOutputOp, isFirstRow, sparkJob.sparkJobId) + generateDroppedJobRow(outputOpData, outputOpDescription, formattedOutputOpDuration, + numSparkJobRowsInOutputOp, isFirstRow, sparkJob.sparkJobId) } } private def generateOutputOpRowWithoutSparkJobs( - outputOpId: OutputOpId, + outputOpData: OutputOperationUIData, outputOpDescription: Seq[Node], - formattedOutputOpDuration: String, - outputOpStatus: String): Seq[Node] = { + formattedOutputOpDuration: String): Seq[Node] = { - {outputOpId.toString} + {outputOpData.id.toString} {outputOpDescription} {formattedOutputOpDuration} - {outputOpStatusCell(outputOpStatus, rowspan = 1)} + {outputOpStatusCell(outputOpData, rowspan = 1)} - @@ -91,10 +89,9 @@ private[ui] class BatchPage(parent: StreamingTab) extends WebUIPage("batch") { * one cell, we use "rowspan" for the first row of a output op. */ private def generateNormalJobRow( - outputOpId: OutputOpId, + outputOpData: OutputOperationUIData, outputOpDescription: Seq[Node], formattedOutputOpDuration: String, - outputOpStatus: String, numSparkJobRowsInOutputOp: Int, isFirstRow: Boolean, sparkJob: JobUIData): Seq[Node] = { @@ -116,12 +113,12 @@ private[ui] class BatchPage(parent: StreamingTab) extends WebUIPage("batch") { // scalastyle:off val prefixCells = if (isFirstRow) { - {outputOpId.toString} + {outputOpData.id.toString} {outputOpDescription} {formattedOutputOpDuration} ++ - {outputOpStatusCell(outputOpStatus, numSparkJobRowsInOutputOp)} + {outputOpStatusCell(outputOpData, numSparkJobRowsInOutputOp)} } else { Nil } @@ -161,10 +158,9 @@ private[ui] class BatchPage(parent: StreamingTab) extends WebUIPage("batch") { * with "-" cells. */ private def generateDroppedJobRow( - outputOpId: OutputOpId, + outputOpData: OutputOperationUIData, outputOpDescription: Seq[Node], formattedOutputOpDuration: String, - outputOpStatus: String, numSparkJobRowsInOutputOp: Int, isFirstRow: Boolean, jobId: Int): Seq[Node] = { @@ -173,10 +169,10 @@ private[ui] class BatchPage(parent: StreamingTab) extends WebUIPage("batch") { // scalastyle:off val prefixCells = if (isFirstRow) { - {outputOpId.toString} + {outputOpData.id.toString} {outputOpDescription} {formattedOutputOpDuration} ++ - {outputOpStatusCell(outputOpStatus, numSparkJobRowsInOutputOp)} + {outputOpStatusCell(outputOpData, numSparkJobRowsInOutputOp)} } else { Nil } @@ -199,47 +195,34 @@ private[ui] class BatchPage(parent: StreamingTab) extends WebUIPage("batch") { } private def generateOutputOpIdRow( - outputOpId: OutputOpId, - outputOpStatus: String, + outputOpData: OutputOperationUIData, sparkJobs: Seq[SparkJobIdWithUIData]): Seq[Node] = { - // We don't count the durations of dropped jobs - val sparkJobDurations = sparkJobs.filter(_.jobUIData.nonEmpty).map(_.jobUIData.get). 
- map(sparkJob => { - sparkJob.submissionTime.map { start => - val end = sparkJob.completionTime.getOrElse(System.currentTimeMillis()) - end - start - } - }) val formattedOutputOpDuration = - if (sparkJobDurations.isEmpty || sparkJobDurations.exists(_ == None)) { - // If no job or any job does not finish, set "formattedOutputOpDuration" to "-" + if (outputOpData.duration.isEmpty) { "-" } else { - SparkUIUtils.formatDuration(sparkJobDurations.flatMap(x => x).sum) + SparkUIUtils.formatDuration(outputOpData.duration.get) } - val description = generateOutputOpDescription(sparkJobs) + val description = generateOutputOpDescription(outputOpData) if (sparkJobs.isEmpty) { - generateOutputOpRowWithoutSparkJobs( - outputOpId, description, formattedOutputOpDuration, outputOpStatus) + generateOutputOpRowWithoutSparkJobs(outputOpData, description, formattedOutputOpDuration) } else { val firstRow = generateJobRow( - outputOpId, + outputOpData, description, formattedOutputOpDuration, - outputOpStatus, sparkJobs.size, true, sparkJobs.head) val tailRows = sparkJobs.tail.map { sparkJob => generateJobRow( - outputOpId, + outputOpData, description, formattedOutputOpDuration, - outputOpStatus, sparkJobs.size, false, sparkJob) @@ -248,35 +231,18 @@ private[ui] class BatchPage(parent: StreamingTab) extends WebUIPage("batch") { } } - private def generateOutputOpDescription(sparkJobs: Seq[SparkJobIdWithUIData]): Seq[Node] = { - val lastStageInfo = - sparkJobs.flatMap(_.jobUIData).headOption. // Get the first JobUIData - flatMap { sparkJob => // For the first job, get the latest Stage info - if (sparkJob.stageIds.isEmpty) { - None - } else { - sparkListener.stageIdToInfo.get(sparkJob.stageIds.max) - } - } - lastStageInfo match { - case Some(stageInfo) => - val details = if (stageInfo.details.nonEmpty) { - - +details - ++ - - } else { - NodeSeq.Empty - } - -
{stageInfo.name} {details}
- case None => - Text("(Unknown)") - } + private def generateOutputOpDescription(outputOp: OutputOperationUIData): Seq[Node] = { +
+ {outputOp.name} + + +details + + +
} private def failureReasonCell( @@ -329,6 +295,19 @@ private[ui] class BatchPage(parent: StreamingTab) extends WebUIPage("batch") { } } + private def generateOutputOperationStatusForUI(failure: String): String = { + if (failure.startsWith("org.apache.spark.SparkException")) { + "Failed due to Spark job error\n" + failure + } else { + var nextLineIndex = failure.indexOf("\n") + if (nextLineIndex < 0) { + nextLineIndex = failure.size + } + val firstLine = failure.substring(0, nextLineIndex) + s"Failed due to error: $firstLine\n$failure" + } + } + /** * Generate the job table for the batch. */ @@ -338,26 +317,15 @@ private[ui] class BatchPage(parent: StreamingTab) extends WebUIPage("batch") { // sort SparkJobIds for each OutputOpId (outputOpId, outputOpIdAndSparkJobIds.map(_.sparkJobId).sorted) } - val outputOps = (0 until batchUIData.numOutputOp).map { outputOpId => - val status = batchUIData.failureReason.get(outputOpId).map { failure => - if (failure.startsWith("org.apache.spark.SparkException")) { - "Failed due to Spark job error\n" + failure - } else { - var nextLineIndex = failure.indexOf("\n") - if (nextLineIndex < 0) { - nextLineIndex = failure.size - } - val firstLine = failure.substring(0, nextLineIndex) - s"Failed due to error: $firstLine\n$failure" - } - }.getOrElse("Succeeded") - val sparkJobIds = outputOpIdToSparkJobIds.getOrElse(outputOpId, Seq.empty) - (outputOpId, status, sparkJobIds) - } + + val outputOps: Seq[(OutputOperationUIData, Seq[SparkJobId])] = + batchUIData.outputOperations.map { case (outputOpId, outputOperation) => + val sparkJobIds = outputOpIdToSparkJobIds.getOrElse(outputOpId, Seq.empty) + (outputOperation, sparkJobIds) + }.toSeq.sortBy(_._1.id) sparkListener.synchronized { - val outputOpIdWithJobs: Seq[(OutputOpId, String, Seq[SparkJobIdWithUIData])] = - outputOps.map { case (outputOpId, status, sparkJobIds) => - (outputOpId, status, + val outputOpWithJobs = outputOps.map { case (outputOpData, sparkJobIds) => + (outputOpData, sparkJobIds.map(sparkJobId => SparkJobIdWithUIData(sparkJobId, getJobData(sparkJobId)))) } @@ -367,9 +335,8 @@ private[ui] class BatchPage(parent: StreamingTab) extends WebUIPage("batch") { { - outputOpIdWithJobs.map { - case (outputOpId, status, sparkJobIds) => - generateOutputOpIdRow(outputOpId, status, sparkJobIds) + outputOpWithJobs.map { case (outputOpData, sparkJobIds) => + generateOutputOpIdRow(outputOpData, sparkJobIds) } } @@ -377,7 +344,7 @@ private[ui] class BatchPage(parent: StreamingTab) extends WebUIPage("batch") { } } - def render(request: HttpServletRequest): Seq[Node] = { + def render(request: HttpServletRequest): Seq[Node] = streamingListener.synchronized { val batchTime = Option(request.getParameter("id")).map(id => Time(id.toLong)).getOrElse { throw new IllegalArgumentException(s"Missing id parameter") } @@ -430,14 +397,7 @@ private[ui] class BatchPage(parent: StreamingTab) extends WebUIPage("batch") { - val jobTable = - if (batchUIData.outputOpIdSparkJobIdPairs.isEmpty) { -
Cannot find any job for Batch {formattedBatchTime}.
- } else { - generateJobTable(batchUIData) - } - - val content = summary ++ jobTable + val content = summary ++ generateJobTable(batchUIData) SparkUIUtils.headerSparkPage(s"Details of batch at $formattedBatchTime", content, parent) } @@ -471,11 +431,17 @@ private[ui] class BatchPage(parent: StreamingTab) extends WebUIPage("batch") { replaceAllLiterally("\t", "    ").replaceAllLiterally("\n", "
")) } - private def outputOpStatusCell(status: String, rowspan: Int): Seq[Node] = { - if (status == "Succeeded") { - Succeeded - } else { - failureReasonCell(status, rowspan, includeFirstLineInExpandDetails = false) + private def outputOpStatusCell(outputOp: OutputOperationUIData, rowspan: Int): Seq[Node] = { + outputOp.failureReason match { + case Some(failureReason) => + val failureReasonForUI = generateOutputOperationStatusForUI(failureReason) + failureReasonCell(failureReasonForUI, rowspan, includeFirstLineInExpandDetails = false) + case None => + if (outputOp.endTime.isEmpty) { + - + } else { + Succeeded + } } } } diff --git a/streaming/src/main/scala/org/apache/spark/streaming/ui/BatchUIData.scala b/streaming/src/main/scala/org/apache/spark/streaming/ui/BatchUIData.scala index e6c2e2140c6c4..3ef3689de1c45 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/ui/BatchUIData.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/ui/BatchUIData.scala @@ -18,8 +18,10 @@ package org.apache.spark.streaming.ui +import scala.collection.mutable + import org.apache.spark.streaming.Time -import org.apache.spark.streaming.scheduler.{BatchInfo, StreamInputInfo} +import org.apache.spark.streaming.scheduler.{BatchInfo, OutputOperationInfo, StreamInputInfo} import org.apache.spark.streaming.ui.StreamingJobProgressListener._ private[ui] case class OutputOpIdAndSparkJobId(outputOpId: OutputOpId, sparkJobId: SparkJobId) @@ -30,8 +32,7 @@ private[ui] case class BatchUIData( val submissionTime: Long, val processingStartTime: Option[Long], val processingEndTime: Option[Long], - val numOutputOp: Int, - val failureReason: Map[Int, String], + val outputOperations: mutable.HashMap[OutputOpId, OutputOperationUIData] = mutable.HashMap(), var outputOpIdSparkJobIdPairs: Seq[OutputOpIdAndSparkJobId] = Seq.empty) { /** @@ -61,19 +62,75 @@ private[ui] case class BatchUIData( * The number of recorders received by the receivers in this batch. */ def numRecords: Long = streamIdToInputInfo.values.map(_.numRecords).sum + + /** + * Update an output operation information of this batch. + */ + def updateOutputOperationInfo(outputOperationInfo: OutputOperationInfo): Unit = { + assert(batchTime == outputOperationInfo.batchTime) + outputOperations(outputOperationInfo.id) = OutputOperationUIData(outputOperationInfo) + } + + /** + * Return the number of failed output operations. + */ + def numFailedOutputOp: Int = outputOperations.values.count(_.failureReason.nonEmpty) + + /** + * Return the number of running output operations. + */ + def numActiveOutputOp: Int = outputOperations.values.count(_.endTime.isEmpty) + + /** + * Return the number of completed output operations. 
+ */ + def numCompletedOutputOp: Int = outputOperations.values.count { + op => op.failureReason.isEmpty && op.endTime.nonEmpty + } + + /** + * Return if this batch has any output operations + */ + def isFailed: Boolean = numFailedOutputOp != 0 } private[ui] object BatchUIData { def apply(batchInfo: BatchInfo): BatchUIData = { + val outputOperations = mutable.HashMap[OutputOpId, OutputOperationUIData]() + outputOperations ++= batchInfo.outputOperationInfos.mapValues(OutputOperationUIData.apply) new BatchUIData( batchInfo.batchTime, batchInfo.streamIdToInputInfo, batchInfo.submissionTime, batchInfo.processingStartTime, batchInfo.processingEndTime, - batchInfo.numOutputOp, - batchInfo.failureReasons + outputOperations + ) + } +} + +private[ui] case class OutputOperationUIData( + id: OutputOpId, + name: String, + description: String, + startTime: Option[Long], + endTime: Option[Long], + failureReason: Option[String]) { + + def duration: Option[Long] = for (s <- startTime; e <- endTime) yield e - s +} + +private[ui] object OutputOperationUIData { + + def apply(outputOperationInfo: OutputOperationInfo): OutputOperationUIData = { + OutputOperationUIData( + outputOperationInfo.id, + outputOperationInfo.name, + outputOperationInfo.description, + outputOperationInfo.startTime, + outputOperationInfo.endTime, + outputOperationInfo.failureReason ) } } diff --git a/streaming/src/main/scala/org/apache/spark/streaming/ui/StreamingJobProgressListener.scala b/streaming/src/main/scala/org/apache/spark/streaming/ui/StreamingJobProgressListener.scala index 78aeb004e18b1..f6cc6edf2569a 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/ui/StreamingJobProgressListener.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/ui/StreamingJobProgressListener.scala @@ -119,6 +119,20 @@ private[streaming] class StreamingJobProgressListener(ssc: StreamingContext) } } + override def onOutputOperationStarted( + outputOperationStarted: StreamingListenerOutputOperationStarted): Unit = synchronized { + // This method is called after onBatchStarted + runningBatchUIData(outputOperationStarted.outputOperationInfo.batchTime). + updateOutputOperationInfo(outputOperationStarted.outputOperationInfo) + } + + override def onOutputOperationCompleted( + outputOperationCompleted: StreamingListenerOutputOperationCompleted): Unit = synchronized { + // This method is called before onBatchCompleted + runningBatchUIData(outputOperationCompleted.outputOperationInfo.batchTime). 
+ updateOutputOperationInfo(outputOperationCompleted.outputOperationInfo) + } + override def onJobStart(jobStart: SparkListenerJobStart): Unit = synchronized { getBatchTimeAndOutputOpId(jobStart.properties).foreach { case (batchTime, outputOpId) => var outputOpIdToSparkJobIds = batchTimeToOutputOpIdSparkJobIdPair.get(batchTime) diff --git a/streaming/src/test/scala/org/apache/spark/streaming/StreamingListenerSuite.scala b/streaming/src/test/scala/org/apache/spark/streaming/StreamingListenerSuite.scala index 2b43b7467042b..5dc0472c7770c 100644 --- a/streaming/src/test/scala/org/apache/spark/streaming/StreamingListenerSuite.scala +++ b/streaming/src/test/scala/org/apache/spark/streaming/StreamingListenerSuite.scala @@ -17,7 +17,7 @@ package org.apache.spark.streaming -import scala.collection.mutable.{ArrayBuffer, SynchronizedBuffer} +import scala.collection.mutable.{ArrayBuffer, HashMap, SynchronizedBuffer, SynchronizedMap} import scala.concurrent.Future import scala.concurrent.ExecutionContext.Implicits.global @@ -221,7 +221,7 @@ class StreamingListenerSuite extends TestSuiteBase with Matchers { } } _ssc.stop() - failureReasonsCollector.failureReasons + failureReasonsCollector.failureReasons.toMap } /** Check if a sequence of numbers is in increasing order */ @@ -307,14 +307,16 @@ class StreamingListenerSuiteReceiver extends Receiver[Any](StorageLevel.MEMORY_O } /** - * A StreamingListener that saves the latest `failureReasons` in `BatchInfo` to the `failureReasons` - * field. + * A StreamingListener that saves all latest `failureReasons` in a batch. */ class FailureReasonsCollector extends StreamingListener { - @volatile var failureReasons: Map[Int, String] = null + val failureReasons = new HashMap[Int, String] with SynchronizedMap[Int, String] - override def onBatchCompleted(batchCompleted: StreamingListenerBatchCompleted): Unit = { - failureReasons = batchCompleted.batchInfo.failureReasons + override def onOutputOperationCompleted( + outputOperationCompleted: StreamingListenerOutputOperationCompleted): Unit = { + outputOperationCompleted.outputOperationInfo.failureReason.foreach { f => + failureReasons(outputOperationCompleted.outputOperationInfo.id) = f + } } } diff --git a/streaming/src/test/scala/org/apache/spark/streaming/UISeleniumSuite.scala b/streaming/src/test/scala/org/apache/spark/streaming/UISeleniumSuite.scala index d1df78871d3b8..a5744a9009c1c 100644 --- a/streaming/src/test/scala/org/apache/spark/streaming/UISeleniumSuite.scala +++ b/streaming/src/test/scala/org/apache/spark/streaming/UISeleniumSuite.scala @@ -117,7 +117,7 @@ class UISeleniumSuite findAll(cssSelector("""#active-batches-table th""")).map(_.text).toSeq should be { List("Batch Time", "Input Size", "Scheduling Delay (?)", "Processing Time (?)", - "Status") + "Output Ops: Succeeded/Total", "Status") } findAll(cssSelector("""#completed-batches-table th""")).map(_.text).toSeq should be { List("Batch Time", "Input Size", "Scheduling Delay (?)", "Processing Time (?)", diff --git a/streaming/src/test/scala/org/apache/spark/streaming/ui/StreamingJobProgressListenerSuite.scala b/streaming/src/test/scala/org/apache/spark/streaming/ui/StreamingJobProgressListenerSuite.scala index 995f1197ccdfd..af4718b4eb705 100644 --- a/streaming/src/test/scala/org/apache/spark/streaming/ui/StreamingJobProgressListenerSuite.scala +++ b/streaming/src/test/scala/org/apache/spark/streaming/ui/StreamingJobProgressListenerSuite.scala @@ -63,7 +63,7 @@ class StreamingJobProgressListenerSuite extends TestSuiteBase with Matchers { 1 -> 
StreamInputInfo(1, 300L, Map(StreamInputInfo.METADATA_KEY_DESCRIPTION -> "test"))) // onBatchSubmitted - val batchInfoSubmitted = BatchInfo(Time(1000), streamIdToInputInfo, 1000, None, None) + val batchInfoSubmitted = BatchInfo(Time(1000), streamIdToInputInfo, 1000, None, None, Map.empty) listener.onBatchSubmitted(StreamingListenerBatchSubmitted(batchInfoSubmitted)) listener.waitingBatches should be (List(BatchUIData(batchInfoSubmitted))) listener.runningBatches should be (Nil) @@ -75,7 +75,8 @@ class StreamingJobProgressListenerSuite extends TestSuiteBase with Matchers { listener.numTotalReceivedRecords should be (0) // onBatchStarted - val batchInfoStarted = BatchInfo(Time(1000), streamIdToInputInfo, 1000, Some(2000), None) + val batchInfoStarted = + BatchInfo(Time(1000), streamIdToInputInfo, 1000, Some(2000), None, Map.empty) listener.onBatchStarted(StreamingListenerBatchStarted(batchInfoStarted)) listener.waitingBatches should be (Nil) listener.runningBatches should be (List(BatchUIData(batchInfoStarted))) @@ -116,7 +117,8 @@ class StreamingJobProgressListenerSuite extends TestSuiteBase with Matchers { OutputOpIdAndSparkJobId(1, 1)) // onBatchCompleted - val batchInfoCompleted = BatchInfo(Time(1000), streamIdToInputInfo, 1000, Some(2000), None) + val batchInfoCompleted = + BatchInfo(Time(1000), streamIdToInputInfo, 1000, Some(2000), None, Map.empty) listener.onBatchCompleted(StreamingListenerBatchCompleted(batchInfoCompleted)) listener.waitingBatches should be (Nil) listener.runningBatches should be (Nil) @@ -156,7 +158,8 @@ class StreamingJobProgressListenerSuite extends TestSuiteBase with Matchers { val streamIdToInputInfo = Map(0 -> StreamInputInfo(0, 300L), 1 -> StreamInputInfo(1, 300L)) - val batchInfoCompleted = BatchInfo(Time(1000), streamIdToInputInfo, 1000, Some(2000), None) + val batchInfoCompleted = + BatchInfo(Time(1000), streamIdToInputInfo, 1000, Some(2000), None, Map.empty) for(_ <- 0 until (limit + 10)) { listener.onBatchCompleted(StreamingListenerBatchCompleted(batchInfoCompleted)) @@ -173,8 +176,8 @@ class StreamingJobProgressListenerSuite extends TestSuiteBase with Matchers { // fulfill completedBatchInfos for(i <- 0 until limit) { - val batchInfoCompleted = - BatchInfo(Time(1000 + i * 100), Map.empty, 1000 + i * 100, Some(2000 + i * 100), None) + val batchInfoCompleted = BatchInfo( + Time(1000 + i * 100), Map.empty, 1000 + i * 100, Some(2000 + i * 100), None, Map.empty) listener.onBatchCompleted(StreamingListenerBatchCompleted(batchInfoCompleted)) val jobStart = createJobStart(Time(1000 + i * 100), outputOpId = 0, jobId = 1) listener.onJobStart(jobStart) @@ -185,7 +188,7 @@ class StreamingJobProgressListenerSuite extends TestSuiteBase with Matchers { listener.onJobStart(jobStart) val batchInfoSubmitted = - BatchInfo(Time(1000 + limit * 100), Map.empty, (1000 + limit * 100), None, None) + BatchInfo(Time(1000 + limit * 100), Map.empty, (1000 + limit * 100), None, None, Map.empty) listener.onBatchSubmitted(StreamingListenerBatchSubmitted(batchInfoSubmitted)) // We still can see the info retrieved from onJobStart @@ -201,8 +204,8 @@ class StreamingJobProgressListenerSuite extends TestSuiteBase with Matchers { // A lot of "onBatchCompleted"s happen before "onJobStart" for(i <- limit + 1 to limit * 2) { - val batchInfoCompleted = - BatchInfo(Time(1000 + i * 100), Map.empty, 1000 + i * 100, Some(2000 + i * 100), None) + val batchInfoCompleted = BatchInfo( + Time(1000 + i * 100), Map.empty, 1000 + i * 100, Some(2000 + i * 100), None, Map.empty) 
listener.onBatchCompleted(StreamingListenerBatchCompleted(batchInfoCompleted)) } @@ -227,11 +230,13 @@ class StreamingJobProgressListenerSuite extends TestSuiteBase with Matchers { val streamIdToInputInfo = Map(0 -> StreamInputInfo(0, 300L), 1 -> StreamInputInfo(1, 300L)) // onBatchSubmitted - val batchInfoSubmitted = BatchInfo(Time(1000), streamIdToInputInfo, 1000, None, None) + val batchInfoSubmitted = + BatchInfo(Time(1000), streamIdToInputInfo, 1000, None, None, Map.empty) listener.onBatchSubmitted(StreamingListenerBatchSubmitted(batchInfoSubmitted)) // onBatchStarted - val batchInfoStarted = BatchInfo(Time(1000), streamIdToInputInfo, 1000, Some(2000), None) + val batchInfoStarted = + BatchInfo(Time(1000), streamIdToInputInfo, 1000, Some(2000), None, Map.empty) listener.onBatchStarted(StreamingListenerBatchStarted(batchInfoStarted)) // onJobStart @@ -248,7 +253,8 @@ class StreamingJobProgressListenerSuite extends TestSuiteBase with Matchers { listener.onJobStart(jobStart4) // onBatchCompleted - val batchInfoCompleted = BatchInfo(Time(1000), streamIdToInputInfo, 1000, Some(2000), None) + val batchInfoCompleted = + BatchInfo(Time(1000), streamIdToInputInfo, 1000, Some(2000), None, Map.empty) listener.onBatchCompleted(StreamingListenerBatchCompleted(batchInfoCompleted)) } From e1eef248f13f6c334fe4eea8a29a1de5470a2e62 Mon Sep 17 00:00:00 2001 From: zsxwing Date: Fri, 16 Oct 2015 13:56:51 -0700 Subject: [PATCH 079/139] [SPARK-11104] [STREAMING] Fix a deadlock in StreamingContex.stop The following deadlock may happen if shutdownHook and StreamingContext.stop are running at the same time. ``` Java stack information for the threads listed above: =================================================== "Thread-2": at org.apache.spark.streaming.StreamingContext.stop(StreamingContext.scala:699) - waiting to lock <0x00000005405a1680> (a org.apache.spark.streaming.StreamingContext) at org.apache.spark.streaming.StreamingContext.org$apache$spark$streaming$StreamingContext$$stopOnShutdown(StreamingContext.scala:729) at org.apache.spark.streaming.StreamingContext$$anonfun$start$1.apply$mcV$sp(StreamingContext.scala:625) at org.apache.spark.util.SparkShutdownHook.run(ShutdownHookManager.scala:266) at org.apache.spark.util.SparkShutdownHookManager$$anonfun$runAll$1$$anonfun$apply$mcV$sp$1.apply$mcV$sp(ShutdownHookManager.scala:236) at org.apache.spark.util.SparkShutdownHookManager$$anonfun$runAll$1$$anonfun$apply$mcV$sp$1.apply(ShutdownHookManager.scala:236) at org.apache.spark.util.SparkShutdownHookManager$$anonfun$runAll$1$$anonfun$apply$mcV$sp$1.apply(ShutdownHookManager.scala:236) at org.apache.spark.util.Utils$.logUncaughtExceptions(Utils.scala:1697) at org.apache.spark.util.SparkShutdownHookManager$$anonfun$runAll$1.apply$mcV$sp(ShutdownHookManager.scala:236) at org.apache.spark.util.SparkShutdownHookManager$$anonfun$runAll$1.apply(ShutdownHookManager.scala:236) at org.apache.spark.util.SparkShutdownHookManager$$anonfun$runAll$1.apply(ShutdownHookManager.scala:236) at scala.util.Try$.apply(Try.scala:161) at org.apache.spark.util.SparkShutdownHookManager.runAll(ShutdownHookManager.scala:236) - locked <0x00000005405b6a00> (a org.apache.spark.util.SparkShutdownHookManager) at org.apache.spark.util.SparkShutdownHookManager$$anon$2.run(ShutdownHookManager.scala:216) at org.apache.hadoop.util.ShutdownHookManager$1.run(ShutdownHookManager.java:54) "main": at org.apache.spark.util.SparkShutdownHookManager.remove(ShutdownHookManager.scala:248) - waiting to lock <0x00000005405b6a00> (a 
org.apache.spark.util.SparkShutdownHookManager) at org.apache.spark.util.ShutdownHookManager$.removeShutdownHook(ShutdownHookManager.scala:199) at org.apache.spark.streaming.StreamingContext.stop(StreamingContext.scala:712) - locked <0x00000005405a1680> (a org.apache.spark.streaming.StreamingContext) at org.apache.spark.streaming.StreamingContext.stop(StreamingContext.scala:684) - locked <0x00000005405a1680> (a org.apache.spark.streaming.StreamingContext) at org.apache.spark.streaming.SessionByKeyBenchmark$.main(SessionByKeyBenchmark.scala:108) at org.apache.spark.streaming.SessionByKeyBenchmark.main(SessionByKeyBenchmark.scala) at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method) at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62) at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43) at java.lang.reflect.Method.invoke(Method.java:497) at org.apache.spark.deploy.SparkSubmit$.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:680) at org.apache.spark.deploy.SparkSubmit$.doRunMain$1(SparkSubmit.scala:180) at org.apache.spark.deploy.SparkSubmit$.submit(SparkSubmit.scala:205) at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:120) at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala) ``` This PR just moved `ShutdownHookManager.removeShutdownHook` out of `synchronized` to avoid deadlock. Author: zsxwing Closes #9116 from zsxwing/stop-deadlock. --- .../spark/streaming/StreamingContext.scala | 55 +++++++++++-------- 1 file changed, 31 insertions(+), 24 deletions(-) diff --git a/streaming/src/main/scala/org/apache/spark/streaming/StreamingContext.scala b/streaming/src/main/scala/org/apache/spark/streaming/StreamingContext.scala index 9b2632c229548..051f53de64cd5 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/StreamingContext.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/StreamingContext.scala @@ -694,32 +694,39 @@ class StreamingContext private[streaming] ( * @param stopGracefully if true, stops gracefully by waiting for the processing of all * received data to be completed */ - def stop(stopSparkContext: Boolean, stopGracefully: Boolean): Unit = synchronized { - try { - state match { - case INITIALIZED => - logWarning("StreamingContext has not been started yet") - case STOPPED => - logWarning("StreamingContext has already been stopped") - case ACTIVE => - scheduler.stop(stopGracefully) - // Removing the streamingSource to de-register the metrics on stop() - env.metricsSystem.removeSource(streamingSource) - uiTab.foreach(_.detach()) - StreamingContext.setActiveContext(null) - waiter.notifyStop() - if (shutdownHookRef != null) { - ShutdownHookManager.removeShutdownHook(shutdownHookRef) - } - logInfo("StreamingContext stopped successfully") + def stop(stopSparkContext: Boolean, stopGracefully: Boolean): Unit = { + var shutdownHookRefToRemove: AnyRef = null + synchronized { + try { + state match { + case INITIALIZED => + logWarning("StreamingContext has not been started yet") + case STOPPED => + logWarning("StreamingContext has already been stopped") + case ACTIVE => + scheduler.stop(stopGracefully) + // Removing the streamingSource to de-register the metrics on stop() + env.metricsSystem.removeSource(streamingSource) + uiTab.foreach(_.detach()) + StreamingContext.setActiveContext(null) + waiter.notifyStop() + if (shutdownHookRef != null) { + shutdownHookRefToRemove = shutdownHookRef + shutdownHookRef = null + } + logInfo("StreamingContext stopped 
successfully") + } + } finally { + // The state should always be Stopped after calling `stop()`, even if we haven't started yet + state = STOPPED } - // Even if we have already stopped, we still need to attempt to stop the SparkContext because - // a user might stop(stopSparkContext = false) and then call stop(stopSparkContext = true). - if (stopSparkContext) sc.stop() - } finally { - // The state should always be Stopped after calling `stop()`, even if we haven't started yet - state = STOPPED } + if (shutdownHookRefToRemove != null) { + ShutdownHookManager.removeShutdownHook(shutdownHookRefToRemove) + } + // Even if we have already stopped, we still need to attempt to stop the SparkContext because + // a user might stop(stopSparkContext = false) and then call stop(stopSparkContext = true). + if (stopSparkContext) sc.stop() } private def stopOnShutdown(): Unit = { From ac09a3a465f3b57f3964c5fd621ab0d2216e2354 Mon Sep 17 00:00:00 2001 From: gweidner Date: Fri, 16 Oct 2015 14:02:12 -0700 Subject: [PATCH 080/139] [SPARK-11109] [CORE] Move FsHistoryProvider off deprecated AccessControlException Switched from deprecated org.apache.hadoop.fs.permission.AccessControlException to org.apache.hadoop.security.AccessControlException. Author: gweidner Closes #9144 from gweidner/SPARK-11109. --- .../org/apache/spark/deploy/history/FsHistoryProvider.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala b/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala index 5eb8adf97d90b..80bfda9dddb39 100644 --- a/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala +++ b/core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala @@ -27,7 +27,7 @@ import scala.collection.mutable import com.google.common.io.ByteStreams import com.google.common.util.concurrent.{MoreExecutors, ThreadFactoryBuilder} import org.apache.hadoop.fs.{FileStatus, FileSystem, Path} -import org.apache.hadoop.fs.permission.AccessControlException +import org.apache.hadoop.security.AccessControlException import org.apache.spark.{Logging, SecurityManager, SparkConf, SparkException} import org.apache.spark.deploy.SparkHadoopUtil From 1ec0a0dc2819d3db3555799cb78c2946f652bff4 Mon Sep 17 00:00:00 2001 From: Bhargav Mangipudi Date: Fri, 16 Oct 2015 14:36:05 -0700 Subject: [PATCH 081/139] =?UTF-8?q?[SPARK-11050]=20[MLLIB]=20PySpark=20Spa?= =?UTF-8?q?rseVector=20can=20return=20wrong=20index=20in=20e=E2=80=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit …rror message For negative indices in the SparseVector, we update the index value. If we have an incorrect index at this point, the error message has the incorrect *updated* index instead of the original one. This change contains the fix for the same. Author: Bhargav Mangipudi Closes #9069 from bhargav/spark-10759. --- python/pyspark/mllib/linalg/__init__.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/python/pyspark/mllib/linalg/__init__.py b/python/pyspark/mllib/linalg/__init__.py index d903b9030d8ce..5276eb41cf29e 100644 --- a/python/pyspark/mllib/linalg/__init__.py +++ b/python/pyspark/mllib/linalg/__init__.py @@ -764,10 +764,11 @@ def __getitem__(self, index): if not isinstance(index, int): raise TypeError( "Indices must be of type integer, got type %s" % type(index)) + + if index >= self.size or index < -self.size: + raise ValueError("Index %d out of bounds." 
% index) if index < 0: index += self.size - if index >= self.size or index < 0: - raise ValueError("Index %d out of bounds." % index) insert_index = np.searchsorted(inds, index) if insert_index >= inds.size: From 10046ea76cf8f0d08fe7ef548e4dbec69d9c73b8 Mon Sep 17 00:00:00 2001 From: Burak Yavuz Date: Fri, 16 Oct 2015 15:30:07 -0700 Subject: [PATCH 082/139] [SPARK-10599] [MLLIB] Lower communication for block matrix multiplication This PR aims to decrease communication costs in BlockMatrix multiplication in two ways: - Simulate the multiplication on the driver, and figure out which blocks actually need to be shuffled - Send the block once to a partition, and join inside the partition rather than sending multiple copies to the same partition **NOTE**: One important note is that right now, the old behavior of checking for multiple blocks with the same index is lost. This is not hard to add, but is a little more expensive than how it was. Initial benchmarking showed promising results (look below), however I did hit some `FileNotFound` exceptions with the new implementation after the shuffle. Size A: 1e5 x 1e5 Size B: 1e5 x 1e5 Block Sizes: 1024 x 1024 Sparsity: 0.01 Old implementation: 1m 13s New implementation: 9s cc avulanov Would you be interested in helping me benchmark this? I used your code from the mailing list (which you sent about 3 months ago?), and the old implementation didn't even run, but the new implementation completed in 268s in a 120 GB / 16 core cluster Author: Burak Yavuz Closes #8757 from brkyvz/opt-bmm. --- .../linalg/distributed/BlockMatrix.scala | 80 ++++++++++++++----- .../linalg/distributed/BlockMatrixSuite.scala | 18 +++++ 2 files changed, 76 insertions(+), 22 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/BlockMatrix.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/BlockMatrix.scala index a33b6137cf9cc..81a6c0550bda7 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/BlockMatrix.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/BlockMatrix.scala @@ -54,12 +54,14 @@ private[mllib] class GridPartitioner( /** * Returns the index of the partition the input coordinate belongs to. * - * @param key The coordinate (i, j) or a tuple (i, j, k), where k is the inner index used in - * multiplication. k is ignored in computing partitions. + * @param key The partition id i (calculated through this method for coordinate (i, j) in + * `simulateMultiply`, the coordinate (i, j) or a tuple (i, j, k), where k is + * the inner index used in multiplication. k is ignored in computing partitions. * @return The index of the partition, which the coordinate belongs to. */ override def getPartition(key: Any): Int = { key match { + case i: Int => i case (i: Int, j: Int) => getPartitionId(i, j) case (i: Int, j: Int, _: Int) => @@ -352,12 +354,49 @@ class BlockMatrix @Since("1.3.0") ( } } + /** Block (i,j) --> Set of destination partitions */ + private type BlockDestinations = Map[(Int, Int), Set[Int]] + + /** + * Simulate the multiplication with just block indices in order to cut costs on communication, + * when we are actually shuffling the matrices. + * The `colsPerBlock` of this matrix must equal the `rowsPerBlock` of `other`. + * Exposed for tests. + * + * @param other The BlockMatrix to multiply + * @param partitioner The partitioner that will be used for the resulting matrix `C = A * B` + * @return A tuple of [[BlockDestinations]]. 
The first element is the Map of the set of partitions + * that we need to shuffle each blocks of `this`, and the second element is the Map for + * `other`. + */ + private[distributed] def simulateMultiply( + other: BlockMatrix, + partitioner: GridPartitioner): (BlockDestinations, BlockDestinations) = { + val leftMatrix = blockInfo.keys.collect() // blockInfo should already be cached + val rightMatrix = other.blocks.keys.collect() + val leftDestinations = leftMatrix.map { case (rowIndex, colIndex) => + val rightCounterparts = rightMatrix.filter(_._1 == colIndex) + val partitions = rightCounterparts.map(b => partitioner.getPartition((rowIndex, b._2))) + ((rowIndex, colIndex), partitions.toSet) + }.toMap + val rightDestinations = rightMatrix.map { case (rowIndex, colIndex) => + val leftCounterparts = leftMatrix.filter(_._2 == rowIndex) + val partitions = leftCounterparts.map(b => partitioner.getPartition((b._1, colIndex))) + ((rowIndex, colIndex), partitions.toSet) + }.toMap + (leftDestinations, rightDestinations) + } + /** * Left multiplies this [[BlockMatrix]] to `other`, another [[BlockMatrix]]. The `colsPerBlock` * of this matrix must equal the `rowsPerBlock` of `other`. If `other` contains * [[SparseMatrix]], they will have to be converted to a [[DenseMatrix]]. The output * [[BlockMatrix]] will only consist of blocks of [[DenseMatrix]]. This may cause * some performance issues until support for multiplying two sparse matrices is added. + * + * Note: The behavior of multiply has changed in 1.6.0. `multiply` used to throw an error when + * there were blocks with duplicate indices. Now, the blocks with duplicate indices will be added + * with each other. */ @Since("1.3.0") def multiply(other: BlockMatrix): BlockMatrix = { @@ -368,33 +407,30 @@ class BlockMatrix @Since("1.3.0") ( if (colsPerBlock == other.rowsPerBlock) { val resultPartitioner = GridPartitioner(numRowBlocks, other.numColBlocks, math.max(blocks.partitions.length, other.blocks.partitions.length)) - // Each block of A must be multiplied with the corresponding blocks in each column of B. - // TODO: Optimize to send block to a partition once, similar to ALS + val (leftDestinations, rightDestinations) = simulateMultiply(other, resultPartitioner) + // Each block of A must be multiplied with the corresponding blocks in the columns of B. val flatA = blocks.flatMap { case ((blockRowIndex, blockColIndex), block) => - Iterator.tabulate(other.numColBlocks)(j => ((blockRowIndex, j, blockColIndex), block)) + val destinations = leftDestinations.getOrElse((blockRowIndex, blockColIndex), Set.empty) + destinations.map(j => (j, (blockRowIndex, blockColIndex, block))) } // Each block of B must be multiplied with the corresponding blocks in each row of A. val flatB = other.blocks.flatMap { case ((blockRowIndex, blockColIndex), block) => - Iterator.tabulate(numRowBlocks)(i => ((i, blockColIndex, blockRowIndex), block)) + val destinations = rightDestinations.getOrElse((blockRowIndex, blockColIndex), Set.empty) + destinations.map(j => (j, (blockRowIndex, blockColIndex, block))) } - val newBlocks: RDD[MatrixBlock] = flatA.cogroup(flatB, resultPartitioner) - .flatMap { case ((blockRowIndex, blockColIndex, _), (a, b)) => - if (a.size > 1 || b.size > 1) { - throw new SparkException("There are multiple MatrixBlocks with indices: " + - s"($blockRowIndex, $blockColIndex). 
Please remove them.") - } - if (a.nonEmpty && b.nonEmpty) { - val C = b.head match { - case dense: DenseMatrix => a.head.multiply(dense) - case sparse: SparseMatrix => a.head.multiply(sparse.toDense) - case _ => throw new SparkException(s"Unrecognized matrix type ${b.head.getClass}.") + val newBlocks = flatA.cogroup(flatB, resultPartitioner).flatMap { case (pId, (a, b)) => + a.flatMap { case (leftRowIndex, leftColIndex, leftBlock) => + b.filter(_._1 == leftColIndex).map { case (rightRowIndex, rightColIndex, rightBlock) => + val C = rightBlock match { + case dense: DenseMatrix => leftBlock.multiply(dense) + case sparse: SparseMatrix => leftBlock.multiply(sparse.toDense) + case _ => + throw new SparkException(s"Unrecognized matrix type ${rightBlock.getClass}.") } - Iterator(((blockRowIndex, blockColIndex), C.toBreeze)) - } else { - Iterator() + ((leftRowIndex, rightColIndex), C.toBreeze) } - }.reduceByKey(resultPartitioner, (a, b) => a + b) - .mapValues(Matrices.fromBreeze) + } + }.reduceByKey(resultPartitioner, (a, b) => a + b).mapValues(Matrices.fromBreeze) // TODO: Try to use aggregateByKey instead of reduceByKey to get rid of intermediate matrices new BlockMatrix(newBlocks, rowsPerBlock, other.colsPerBlock, numRows(), other.numCols()) } else { diff --git a/mllib/src/test/scala/org/apache/spark/mllib/linalg/distributed/BlockMatrixSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/linalg/distributed/BlockMatrixSuite.scala index 93fe04c139b9a..b8eb10305801c 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/linalg/distributed/BlockMatrixSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/linalg/distributed/BlockMatrixSuite.scala @@ -235,6 +235,24 @@ class BlockMatrixSuite extends SparkFunSuite with MLlibTestSparkContext { assert(localC ~== result absTol 1e-8) } + test("simulate multiply") { + val blocks: Seq[((Int, Int), Matrix)] = Seq( + ((0, 0), new DenseMatrix(2, 2, Array(1.0, 0.0, 0.0, 1.0))), + ((1, 1), new DenseMatrix(2, 2, Array(1.0, 0.0, 0.0, 1.0)))) + val rdd = sc.parallelize(blocks, 2) + val B = new BlockMatrix(rdd, colPerPart, rowPerPart) + val resultPartitioner = GridPartitioner(gridBasedMat.numRowBlocks, B.numColBlocks, + math.max(numPartitions, 2)) + val (destinationsA, destinationsB) = gridBasedMat.simulateMultiply(B, resultPartitioner) + assert(destinationsA((0, 0)) === Set(0)) + assert(destinationsA((0, 1)) === Set(2)) + assert(destinationsA((1, 0)) === Set(0)) + assert(destinationsA((1, 1)) === Set(2)) + assert(destinationsA((2, 1)) === Set(3)) + assert(destinationsB((0, 0)) === Set(0)) + assert(destinationsB((1, 1)) === Set(2, 3)) + } + test("validate") { // No error gridBasedMat.validate() From 8ac71d62d976bbfd0159cac6816dd8fa580ae1cb Mon Sep 17 00:00:00 2001 From: zero323 Date: Fri, 16 Oct 2015 15:53:26 -0700 Subject: [PATCH 083/139] [SPARK-11084] [ML] [PYTHON] Check if index can contain non-zero value before binary search At this moment `SparseVector.__getitem__` executes `np.searchsorted` first and checks if result is in an expected range after that. It is possible to check if index can contain non-zero value before executing `np.searchsorted`. Author: zero323 Closes #9098 from zero323/sparse_vector_getitem_improved. 
--- python/pyspark/mllib/linalg/__init__.py | 4 ++-- python/pyspark/mllib/tests.py | 10 ++++++++++ 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/python/pyspark/mllib/linalg/__init__.py b/python/pyspark/mllib/linalg/__init__.py index 5276eb41cf29e..ae9ce58450905 100644 --- a/python/pyspark/mllib/linalg/__init__.py +++ b/python/pyspark/mllib/linalg/__init__.py @@ -770,10 +770,10 @@ def __getitem__(self, index): if index < 0: index += self.size - insert_index = np.searchsorted(inds, index) - if insert_index >= inds.size: + if (inds.size == 0) or (index > inds.item(-1)): return 0. + insert_index = np.searchsorted(inds, index) row_ind = inds[insert_index] if row_ind == index: return vals[insert_index] diff --git a/python/pyspark/mllib/tests.py b/python/pyspark/mllib/tests.py index 2a6a5cd3fe40e..2ad69a0ab1d3d 100644 --- a/python/pyspark/mllib/tests.py +++ b/python/pyspark/mllib/tests.py @@ -252,6 +252,16 @@ def test_sparse_vector_indexing(self): for ind in [7.8, '1']: self.assertRaises(TypeError, sv.__getitem__, ind) + zeros = SparseVector(4, {}) + self.assertEqual(zeros[0], 0.0) + self.assertEqual(zeros[3], 0.0) + for ind in [4, -5]: + self.assertRaises(ValueError, zeros.__getitem__, ind) + + empty = SparseVector(0, {}) + for ind in [-1, 0, 1]: + self.assertRaises(ValueError, empty.__getitem__, ind) + def test_matrix_indexing(self): mat = DenseMatrix(3, 2, [0, 1, 4, 6, 8, 10]) expected = [[0, 6], [1, 8], [4, 10]] From e1e77b22b3b577909a12c3aa898eb53be02267fd Mon Sep 17 00:00:00 2001 From: Yuhao Yang Date: Sat, 17 Oct 2015 10:04:19 -0700 Subject: [PATCH 084/139] [SPARK-11029] [ML] Add computeCost to KMeansModel in spark.ml jira: https://issues.apache.org/jira/browse/SPARK-11029 We should add a method analogous to spark.mllib.clustering.KMeansModel.computeCost to spark.ml.clustering.KMeansModel. This will be a temp fix until we have proper evaluators defined for clustering. Author: Yuhao Yang Author: yuhaoyang Closes #9073 from hhbyyh/computeCost. --- .../org/apache/spark/ml/clustering/KMeans.scala | 12 ++++++++++++ .../org/apache/spark/ml/clustering/KMeansSuite.scala | 1 + 2 files changed, 13 insertions(+) diff --git a/mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala b/mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala index f40ab71fb22a6..509be63002396 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala @@ -117,6 +117,18 @@ class KMeansModel private[ml] ( @Since("1.5.0") def clusterCenters: Array[Vector] = parentModel.clusterCenters + + /** + * Return the K-means cost (sum of squared distances of points to their nearest center) for this + * model on the given data. + */ + // TODO: Replace the temp fix when we have proper evaluators defined for clustering. 
+ @Since("1.6.0") + def computeCost(dataset: DataFrame): Double = { + SchemaUtils.checkColumnType(dataset.schema, $(featuresCol), new VectorUDT) + val data = dataset.select(col($(featuresCol))).map { case Row(point: Vector) => point } + parentModel.computeCost(data) + } } /** diff --git a/mllib/src/test/scala/org/apache/spark/ml/clustering/KMeansSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/clustering/KMeansSuite.scala index 688b0e31f91dc..c05f90550d161 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/clustering/KMeansSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/clustering/KMeansSuite.scala @@ -104,5 +104,6 @@ class KMeansSuite extends SparkFunSuite with MLlibTestSparkContext { val clusters = transformed.select(predictionColName).map(_.getInt(0)).distinct().collect().toSet assert(clusters.size === k) assert(clusters === Set(0, 1, 2, 3, 4)) + assert(model.computeCost(dataset) < 0.1) } } From cca2258685147be6c950c9f5c4e50eaa1e090714 Mon Sep 17 00:00:00 2001 From: Luvsandondov Lkhamsuren Date: Sat, 17 Oct 2015 10:07:42 -0700 Subject: [PATCH 085/139] [SPARK-9963] [ML] RandomForest cleanup: replace predictNodeIndex with predictImpl predictNodeIndex is moved to LearningNode and renamed predictImpl for consistency with Node.predictImpl Author: Luvsandondov Lkhamsuren Closes #8609 from lkhamsurenl/SPARK-9963. --- .../scala/org/apache/spark/ml/tree/Node.scala | 37 ++++++++++++++++ .../spark/ml/tree/impl/RandomForest.scala | 44 +------------------ 2 files changed, 38 insertions(+), 43 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/tree/Node.scala b/mllib/src/main/scala/org/apache/spark/ml/tree/Node.scala index cd24931293903..d89682611e3f5 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/tree/Node.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/tree/Node.scala @@ -279,6 +279,43 @@ private[tree] class LearningNode( } } + /** + * Get the node index corresponding to this data point. + * This function mimics prediction, passing an example from the root node down to a leaf + * or unsplit node; that node's index is returned. + * + * @param binnedFeatures Binned feature vector for data point. + * @param splits possible splits for all features, indexed (numFeatures)(numSplits) + * @return Leaf index if the data point reaches a leaf. + * Otherwise, last node reachable in tree matching this example. + * Note: This is the global node index, i.e., the index used in the tree. + * This index is different from the index used during training a particular + * group of nodes on one call to [[findBestSplits()]]. + */ + def predictImpl(binnedFeatures: Array[Int], splits: Array[Array[Split]]): Int = { + if (this.isLeaf || this.split.isEmpty) { + this.id + } else { + val split = this.split.get + val featureIndex = split.featureIndex + val splitLeft = split.shouldGoLeft(binnedFeatures(featureIndex), splits(featureIndex)) + if (this.leftChild.isEmpty) { + // Not yet split. 
Return next layer of nodes to train + if (splitLeft) { + LearningNode.leftChildIndex(this.id) + } else { + LearningNode.rightChildIndex(this.id) + } + } else { + if (splitLeft) { + this.leftChild.get.predictImpl(binnedFeatures, splits) + } else { + this.rightChild.get.predictImpl(binnedFeatures, splits) + } + } + } + } + } private[tree] object LearningNode { diff --git a/mllib/src/main/scala/org/apache/spark/ml/tree/impl/RandomForest.scala b/mllib/src/main/scala/org/apache/spark/ml/tree/impl/RandomForest.scala index c494556085e95..96d5652857e08 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/tree/impl/RandomForest.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/tree/impl/RandomForest.scala @@ -205,47 +205,6 @@ private[ml] object RandomForest extends Logging { } } - /** - * Get the node index corresponding to this data point. - * This function mimics prediction, passing an example from the root node down to a leaf - * or unsplit node; that node's index is returned. - * - * @param node Node in tree from which to classify the given data point. - * @param binnedFeatures Binned feature vector for data point. - * @param splits possible splits for all features, indexed (numFeatures)(numSplits) - * @return Leaf index if the data point reaches a leaf. - * Otherwise, last node reachable in tree matching this example. - * Note: This is the global node index, i.e., the index used in the tree. - * This index is different from the index used during training a particular - * group of nodes on one call to [[findBestSplits()]]. - */ - private def predictNodeIndex( - node: LearningNode, - binnedFeatures: Array[Int], - splits: Array[Array[Split]]): Int = { - if (node.isLeaf || node.split.isEmpty) { - node.id - } else { - val split = node.split.get - val featureIndex = split.featureIndex - val splitLeft = split.shouldGoLeft(binnedFeatures(featureIndex), splits(featureIndex)) - if (node.leftChild.isEmpty) { - // Not yet split. Return index from next layer of nodes to train - if (splitLeft) { - LearningNode.leftChildIndex(node.id) - } else { - LearningNode.rightChildIndex(node.id) - } - } else { - if (splitLeft) { - predictNodeIndex(node.leftChild.get, binnedFeatures, splits) - } else { - predictNodeIndex(node.rightChild.get, binnedFeatures, splits) - } - } - } - } - /** * Helper for binSeqOp, for data which can contain a mix of ordered and unordered features. * @@ -453,8 +412,7 @@ private[ml] object RandomForest extends Logging { agg: Array[DTStatsAggregator], baggedPoint: BaggedPoint[TreePoint]): Array[DTStatsAggregator] = { treeToNodeToIndexInfo.foreach { case (treeIndex, nodeIndexToInfo) => - val nodeIndex = - predictNodeIndex(topNodes(treeIndex), baggedPoint.datum.binnedFeatures, splits) + val nodeIndex = topNodes(treeIndex).predictImpl(baggedPoint.datum.binnedFeatures, splits) nodeBinSeqOp(treeIndex, nodeIndexToInfo.getOrElse(nodeIndex, null), agg, baggedPoint) } agg From 254937420678a299f06b6f4e2696c623da56cf3a Mon Sep 17 00:00:00 2001 From: Reynold Xin Date: Sat, 17 Oct 2015 12:41:42 -0700 Subject: [PATCH 086/139] [SPARK-11165] Logging trait should be private - not DeveloperApi. Its classdoc actually says; "NOTE: DO NOT USE this class outside of Spark. It is intended as an internal utility." Author: Reynold Xin Closes #9155 from rxin/private-logging-trait. 
--- core/src/main/scala/org/apache/spark/Logging.scala | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/Logging.scala b/core/src/main/scala/org/apache/spark/Logging.scala index f0598816d6c07..69f6e06ee0057 100644 --- a/core/src/main/scala/org/apache/spark/Logging.scala +++ b/core/src/main/scala/org/apache/spark/Logging.scala @@ -21,11 +21,10 @@ import org.apache.log4j.{LogManager, PropertyConfigurator} import org.slf4j.{Logger, LoggerFactory} import org.slf4j.impl.StaticLoggerBinder -import org.apache.spark.annotation.DeveloperApi +import org.apache.spark.annotation.Private import org.apache.spark.util.Utils /** - * :: DeveloperApi :: * Utility trait for classes that want to log data. Creates a SLF4J logger for the class and allows * logging messages at different levels using methods that only evaluate parameters lazily if the * log level is enabled. @@ -33,7 +32,7 @@ import org.apache.spark.util.Utils * NOTE: DO NOT USE this class outside of Spark. It is intended as an internal utility. * This will likely be changed or removed in future releases. */ -@DeveloperApi +@Private trait Logging { // Make the log field transient so that objects with Logging can // be serialized and used on another machine From 57f83e36d63bbd79663c49a6c1e8f6c3c8fe4789 Mon Sep 17 00:00:00 2001 From: Koert Kuipers Date: Sat, 17 Oct 2015 14:56:24 -0700 Subject: [PATCH 087/139] [SPARK-10185] [SQL] Feat sql comma separated paths Make sure comma-separated paths get processed correcly in ResolvedDataSource for a HadoopFsRelationProvider Author: Koert Kuipers Closes #8416 from koertkuipers/feat-sql-comma-separated-paths. --- python/pyspark/sql/readwriter.py | 14 +++++- python/test_support/sql/people1.json | 2 + .../apache/spark/sql/DataFrameReader.scala | 11 +++++ .../datasources/ResolvedDataSource.scala | 47 +++++++++++++++---- .../org/apache/spark/sql/DataFrameSuite.scala | 18 +++++++ 5 files changed, 81 insertions(+), 11 deletions(-) create mode 100644 python/test_support/sql/people1.json diff --git a/python/pyspark/sql/readwriter.py b/python/pyspark/sql/readwriter.py index f43d8bf646a9e..93832d4c713e5 100644 --- a/python/pyspark/sql/readwriter.py +++ b/python/pyspark/sql/readwriter.py @@ -116,6 +116,10 @@ def load(self, path=None, format=None, schema=None, **options): ... opt2=1, opt3='str') >>> df.dtypes [('name', 'string'), ('year', 'int'), ('month', 'int'), ('day', 'int')] + >>> df = sqlContext.read.format('json').load(['python/test_support/sql/people.json', + ... 
'python/test_support/sql/people1.json']) + >>> df.dtypes + [('age', 'bigint'), ('aka', 'string'), ('name', 'string')] """ if format is not None: self.format(format) @@ -123,7 +127,15 @@ def load(self, path=None, format=None, schema=None, **options): self.schema(schema) self.options(**options) if path is not None: - return self._df(self._jreader.load(path)) + if type(path) == list: + paths = path + gateway = self._sqlContext._sc._gateway + jpaths = gateway.new_array(gateway.jvm.java.lang.String, len(paths)) + for i in range(0, len(paths)): + jpaths[i] = paths[i] + return self._df(self._jreader.load(jpaths)) + else: + return self._df(self._jreader.load(path)) else: return self._df(self._jreader.load()) diff --git a/python/test_support/sql/people1.json b/python/test_support/sql/people1.json new file mode 100644 index 0000000000000..6d217da77d155 --- /dev/null +++ b/python/test_support/sql/people1.json @@ -0,0 +1,2 @@ +{"name":"Jonathan", "aka": "John"} + diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala index eacdea2c1e5b3..e8651a3569d6f 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala @@ -22,6 +22,7 @@ import java.util.Properties import scala.collection.JavaConverters._ import org.apache.hadoop.fs.Path +import org.apache.hadoop.util.StringUtils import org.apache.spark.annotation.Experimental import org.apache.spark.api.java.JavaRDD @@ -123,6 +124,16 @@ class DataFrameReader private[sql](sqlContext: SQLContext) extends Logging { DataFrame(sqlContext, LogicalRelation(resolved.relation)) } + /** + * Loads input in as a [[DataFrame]], for data sources that support multiple paths. + * Only works if the source is a HadoopFsRelationProvider. + * + * @since 1.6.0 + */ + def load(paths: Array[String]): DataFrame = { + option("paths", paths.map(StringUtils.escapeString(_, '\\', ',')).mkString(",")).load() + } + /** * Construct a [[DataFrame]] representing the database table accessible via JDBC URL * url named table and connection properties. 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/ResolvedDataSource.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/ResolvedDataSource.scala index 011724436621d..54beabbf63b5f 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/ResolvedDataSource.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/ResolvedDataSource.scala @@ -24,6 +24,7 @@ import scala.language.{existentials, implicitConversions} import scala.util.{Success, Failure, Try} import org.apache.hadoop.fs.Path +import org.apache.hadoop.util.StringUtils import org.apache.spark.Logging import org.apache.spark.deploy.SparkHadoopUtil @@ -89,7 +90,11 @@ object ResolvedDataSource extends Logging { val relation = userSpecifiedSchema match { case Some(schema: StructType) => clazz.newInstance() match { case dataSource: SchemaRelationProvider => - dataSource.createRelation(sqlContext, new CaseInsensitiveMap(options), schema) + val caseInsensitiveOptions = new CaseInsensitiveMap(options) + if (caseInsensitiveOptions.contains("paths")) { + throw new AnalysisException(s"$className does not support paths option.") + } + dataSource.createRelation(sqlContext, caseInsensitiveOptions, schema) case dataSource: HadoopFsRelationProvider => val maybePartitionsSchema = if (partitionColumns.isEmpty) { None @@ -99,10 +104,19 @@ object ResolvedDataSource extends Logging { val caseInsensitiveOptions = new CaseInsensitiveMap(options) val paths = { - val patternPath = new Path(caseInsensitiveOptions("path")) - val fs = patternPath.getFileSystem(sqlContext.sparkContext.hadoopConfiguration) - val qualifiedPattern = patternPath.makeQualified(fs.getUri, fs.getWorkingDirectory) - SparkHadoopUtil.get.globPathIfNecessary(qualifiedPattern).map(_.toString).toArray + if (caseInsensitiveOptions.contains("paths") && + caseInsensitiveOptions.contains("path")) { + throw new AnalysisException(s"Both path and paths options are present.") + } + caseInsensitiveOptions.get("paths") + .map(_.split("(? + val hdfsPath = new Path(pathString) + val fs = hdfsPath.getFileSystem(sqlContext.sparkContext.hadoopConfiguration) + val qualified = hdfsPath.makeQualified(fs.getUri, fs.getWorkingDirectory) + SparkHadoopUtil.get.globPathIfNecessary(qualified).map(_.toString) + } } val dataSchema = @@ -122,14 +136,27 @@ object ResolvedDataSource extends Logging { case None => clazz.newInstance() match { case dataSource: RelationProvider => - dataSource.createRelation(sqlContext, new CaseInsensitiveMap(options)) + val caseInsensitiveOptions = new CaseInsensitiveMap(options) + if (caseInsensitiveOptions.contains("paths")) { + throw new AnalysisException(s"$className does not support paths option.") + } + dataSource.createRelation(sqlContext, caseInsensitiveOptions) case dataSource: HadoopFsRelationProvider => val caseInsensitiveOptions = new CaseInsensitiveMap(options) val paths = { - val patternPath = new Path(caseInsensitiveOptions("path")) - val fs = patternPath.getFileSystem(sqlContext.sparkContext.hadoopConfiguration) - val qualifiedPattern = patternPath.makeQualified(fs.getUri, fs.getWorkingDirectory) - SparkHadoopUtil.get.globPathIfNecessary(qualifiedPattern).map(_.toString).toArray + if (caseInsensitiveOptions.contains("paths") && + caseInsensitiveOptions.contains("path")) { + throw new AnalysisException(s"Both path and paths options are present.") + } + caseInsensitiveOptions.get("paths") + .map(_.split("(? 
+ val hdfsPath = new Path(pathString) + val fs = hdfsPath.getFileSystem(sqlContext.sparkContext.hadoopConfiguration) + val qualified = hdfsPath.makeQualified(fs.getUri, fs.getWorkingDirectory) + SparkHadoopUtil.get.globPathIfNecessary(qualified).map(_.toString) + } } dataSource.createRelation(sqlContext, paths, None, None, caseInsensitiveOptions) case dataSource: org.apache.spark.sql.sources.SchemaRelationProvider => diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala index d919877746c72..832ea02cb6e77 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala @@ -890,6 +890,24 @@ class DataFrameSuite extends QueryTest with SharedSQLContext { .collect() } + test("SPARK-10185: Read multiple Hadoop Filesystem paths and paths with a comma in it") { + withTempDir { dir => + val df1 = Seq((1, 22)).toDF("a", "b") + val dir1 = new File(dir, "dir,1").getCanonicalPath + df1.write.format("json").save(dir1) + + val df2 = Seq((2, 23)).toDF("a", "b") + val dir2 = new File(dir, "dir2").getCanonicalPath + df2.write.format("json").save(dir2) + + checkAnswer(sqlContext.read.format("json").load(Array(dir1, dir2)), + Row(1, 22) :: Row(2, 23) :: Nil) + + checkAnswer(sqlContext.read.format("json").load(dir1), + Row(1, 22) :: Nil) + } + } + test("SPARK-10034: Sort on Aggregate with aggregation expression named 'aggOrdering'") { val df = Seq(1 -> 2).toDF("i", "j") val query = df.groupBy('i) From 022a8f6a1f7cb477a65a65482982c021ce08a73c Mon Sep 17 00:00:00 2001 From: ph Date: Sat, 17 Oct 2015 15:37:51 -0700 Subject: [PATCH 088/139] [SPARK-11129] [MESOS] Link Spark WebUI from Mesos WebUI Mesos has a feature for linking to frameworks running on top of Mesos from the Mesos WebUI. This commit enables Spark to make use of this feature so one can directly visit the running Spark WebUIs from the Mesos WebUI. Author: ph Closes #9135 from philipphoffmann/SPARK-11129. 
--- .../cluster/mesos/CoarseMesosSchedulerBackend.scala | 7 ++++++- .../scheduler/cluster/mesos/MesosSchedulerBackend.scala | 7 ++++++- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/CoarseMesosSchedulerBackend.scala b/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/CoarseMesosSchedulerBackend.scala index 65cb5016cfcc9..d10a77f8e5c78 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/CoarseMesosSchedulerBackend.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/CoarseMesosSchedulerBackend.scala @@ -127,7 +127,12 @@ private[spark] class CoarseMesosSchedulerBackend( override def start() { super.start() val driver = createSchedulerDriver( - master, CoarseMesosSchedulerBackend.this, sc.sparkUser, sc.appName, sc.conf) + master, + CoarseMesosSchedulerBackend.this, + sc.sparkUser, + sc.appName, + sc.conf, + sc.ui.map(_.appUIAddress)) startScheduler(driver) } diff --git a/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerBackend.scala b/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerBackend.scala index 8edf7007a5daf..6196176c7cc33 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerBackend.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/cluster/mesos/MesosSchedulerBackend.scala @@ -68,7 +68,12 @@ private[spark] class MesosSchedulerBackend( override def start() { classLoader = Thread.currentThread.getContextClassLoader val driver = createSchedulerDriver( - master, MesosSchedulerBackend.this, sc.sparkUser, sc.appName, sc.conf) + master, + MesosSchedulerBackend.this, + sc.sparkUser, + sc.appName, + sc.conf, + sc.ui.map(_.appUIAddress)) startScheduler(driver) } From e2dfdbb2c0523517880138f214775f9a896f2271 Mon Sep 17 00:00:00 2001 From: huangzhaowei Date: Sat, 17 Oct 2015 16:41:49 -0700 Subject: [PATCH 089/139] [SPARK-11000] [YARN] Load `metadata.Hive` class only when `hive.metastore.uris` was set to avoid bootting the database twice Author: huangzhaowei Closes #9026 from SaintBacchus/SPARK-11000. 
--- .../main/scala/org/apache/spark/deploy/yarn/Client.scala | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala b/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala index 9fcfe362a3ba2..08aecfa7f6fe0 100644 --- a/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala +++ b/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala @@ -1327,11 +1327,8 @@ object Client extends Logging { val mirror = universe.runtimeMirror(getClass.getClassLoader) try { - val hiveClass = mirror.classLoader.loadClass("org.apache.hadoop.hive.ql.metadata.Hive") - val hive = hiveClass.getMethod("get").invoke(null) - - val hiveConf = hiveClass.getMethod("getConf").invoke(hive) val hiveConfClass = mirror.classLoader.loadClass("org.apache.hadoop.hive.conf.HiveConf") + val hiveConf = hiveConfClass.newInstance() val hiveConfGet = (param: String) => Option(hiveConfClass .getMethod("get", classOf[java.lang.String]) @@ -1341,6 +1338,9 @@ object Client extends Logging { // Check for local metastore if (metastore_uri != None && metastore_uri.get.toString.size > 0) { + val hiveClass = mirror.classLoader.loadClass("org.apache.hadoop.hive.ql.metadata.Hive") + val hive = hiveClass.getMethod("get").invoke(null, hiveConf.asInstanceOf[Object]) + val metastore_kerberos_principal_conf_var = mirror.classLoader .loadClass("org.apache.hadoop.hive.conf.HiveConf$ConfVars") .getField("METASTORE_KERBEROS_PRINCIPAL").get("varname").toString From 3895b2113a726171b3c9c04fe41b3cc93d6d14b5 Mon Sep 17 00:00:00 2001 From: tedyu Date: Sun, 18 Oct 2015 02:12:56 -0700 Subject: [PATCH 090/139] [SPARK-11172] Close JsonParser/Generator in test Author: tedyu Closes #9157 from tedyu/master. --- .../sql/execution/datasources/json/JsonSuite.scala | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite.scala index b614e6c4148fd..7540223bf2771 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite.scala @@ -47,13 +47,15 @@ class JsonSuite extends QueryTest with SharedSQLContext with TestJsonData { val factory = new JsonFactory() def enforceCorrectType(value: Any, dataType: DataType): Any = { val writer = new StringWriter() - val generator = factory.createGenerator(writer) - generator.writeObject(value) - generator.flush() + Utils.tryWithResource(factory.createGenerator(writer)) { generator => + generator.writeObject(value) + generator.flush() + } - val parser = factory.createParser(writer.toString) - parser.nextToken() - JacksonParser.convertField(factory, parser, dataType) + Utils.tryWithResource(factory.createParser(writer.toString)) { parser => + parser.nextToken() + JacksonParser.convertField(factory, parser, dataType) + } } val intNumber: Int = 2147483647 From a112d69fdcd9f6d8805be6e0bc6d2211e26867c2 Mon Sep 17 00:00:00 2001 From: Lukasz Piepiora Date: Sun, 18 Oct 2015 14:25:57 +0100 Subject: [PATCH 091/139] [SPARK-11174] [DOCS] Fix typo in the GraphX programming guide This patch fixes a small typo in the GraphX programming guide Author: Lukasz Piepiora Closes #9160 from lpiepiora/11174-fix-typo-in-graphx-programming-guide. 
--- docs/graphx-programming-guide.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/graphx-programming-guide.md b/docs/graphx-programming-guide.md index c861a763d6222..6a512ab234bb2 100644 --- a/docs/graphx-programming-guide.md +++ b/docs/graphx-programming-guide.md @@ -944,7 +944,7 @@ The three additional functions exposed by the `EdgeRDD` are: {% highlight scala %} // Transform the edge attributes while preserving the structure def mapValues[ED2](f: Edge[ED] => ED2): EdgeRDD[ED2] -// Revere the edges reusing both attributes and structure +// Reverse the edges reusing both attributes and structure def reverse: EdgeRDD[ED] // Join two `EdgeRDD`s partitioned using the same partitioning strategy. def innerJoin[ED2, ED3](other: EdgeRDD[ED2])(f: (VertexId, VertexId, ED, ED2) => ED3): EdgeRDD[ED3] From 0480d6ca83d170618fa6a817ad64a2872438d47f Mon Sep 17 00:00:00 2001 From: Reynold Xin Date: Sun, 18 Oct 2015 09:54:38 -0700 Subject: [PATCH 092/139] [SPARK-11169] Remove the extra spaces in merge script Our merge script now turns ``` [SPARK-1234][SPARK-1235][SPARK-1236][SQL] description ``` into ``` [SPARK-1234] [SPARK-1235] [SPARK-1236] [SQL] description ``` The extra spaces are more annoying in git since the first line of a git commit is supposed to be very short. Doctest passes with the following command: ``` python -m doctest merge_spark_pr.py ``` Author: Reynold Xin Closes #9156 from rxin/SPARK-11169. --- dev/merge_spark_pr.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/dev/merge_spark_pr.py b/dev/merge_spark_pr.py index b9bdec3d70864..bf1a000f46791 100755 --- a/dev/merge_spark_pr.py +++ b/dev/merge_spark_pr.py @@ -300,24 +300,24 @@ def resolve_jira_issues(title, merge_branches, comment): def standardize_jira_ref(text): """ Standardize the [SPARK-XXXXX] [MODULE] prefix - Converts "[SPARK-XXX][mllib] Issue", "[MLLib] SPARK-XXX. Issue" or "SPARK XXX [MLLIB]: Issue" to "[SPARK-XXX] [MLLIB] Issue" + Converts "[SPARK-XXX][mllib] Issue", "[MLLib] SPARK-XXX. Issue" or "SPARK XXX [MLLIB]: Issue" to "[SPARK-XXX][MLLIB] Issue" >>> standardize_jira_ref("[SPARK-5821] [SQL] ParquetRelation2 CTAS should check if delete is successful") - '[SPARK-5821] [SQL] ParquetRelation2 CTAS should check if delete is successful' + '[SPARK-5821][SQL] ParquetRelation2 CTAS should check if delete is successful' >>> standardize_jira_ref("[SPARK-4123][Project Infra][WIP]: Show new dependencies added in pull requests") - '[SPARK-4123] [PROJECT INFRA] [WIP] Show new dependencies added in pull requests' + '[SPARK-4123][PROJECT INFRA][WIP] Show new dependencies added in pull requests' >>> standardize_jira_ref("[MLlib] Spark 5954: Top by key") - '[SPARK-5954] [MLLIB] Top by key' + '[SPARK-5954][MLLIB] Top by key' >>> standardize_jira_ref("[SPARK-979] a LRU scheduler for load balancing in TaskSchedulerImpl") '[SPARK-979] a LRU scheduler for load balancing in TaskSchedulerImpl' >>> standardize_jira_ref("SPARK-1094 Support MiMa for reporting binary compatibility accross versions.") '[SPARK-1094] Support MiMa for reporting binary compatibility accross versions.' >>> standardize_jira_ref("[WIP] [SPARK-1146] Vagrant support for Spark") - '[SPARK-1146] [WIP] Vagrant support for Spark' + '[SPARK-1146][WIP] Vagrant support for Spark' >>> standardize_jira_ref("SPARK-1032. If Yarn app fails before registering, app master stays aroun...") '[SPARK-1032] If Yarn app fails before registering, app master stays aroun...' 
>>> standardize_jira_ref("[SPARK-6250][SPARK-6146][SPARK-5911][SQL] Types are now reserved words in DDL parser.") - '[SPARK-6250] [SPARK-6146] [SPARK-5911] [SQL] Types are now reserved words in DDL parser.' + '[SPARK-6250][SPARK-6146][SPARK-5911][SQL] Types are now reserved words in DDL parser.' >>> standardize_jira_ref("Additional information for users building from source code") 'Additional information for users building from source code' """ @@ -325,7 +325,7 @@ def standardize_jira_ref(text): components = [] # If the string is compliant, no need to process any further - if (re.search(r'^\[SPARK-[0-9]{3,6}\] (\[[A-Z0-9_\s,]+\] )+\S+', text)): + if (re.search(r'^\[SPARK-[0-9]{3,6}\](\[[A-Z0-9_\s,]+\] )+\S+', text)): return text # Extract JIRA ref(s): @@ -348,7 +348,7 @@ def standardize_jira_ref(text): text = pattern.search(text).groups()[0] # Assemble full text (JIRA ref(s), module(s), remaining text) - clean_text = ' '.join(jira_refs).strip() + " " + ' '.join(components).strip() + " " + text.strip() + clean_text = ''.join(jira_refs).strip() + ''.join(components).strip() + " " + text.strip() # Replace multiple spaces with a single space, e.g. if no jira refs and/or components were included clean_text = re.sub(r'\s+', ' ', clean_text.strip()) From 8d4449c7f5d528410306c288a042c4594b81a881 Mon Sep 17 00:00:00 2001 From: Patrick Wendell Date: Sun, 18 Oct 2015 10:36:50 -0700 Subject: [PATCH 093/139] MAINTENANCE: Automated closing of pull requests. This commit exists to close the following pull requests on Github: Closes #8737 (close requested by 'srowen') Closes #5323 (close requested by 'JoshRosen') Closes #6148 (close requested by 'JoshRosen') Closes #7557 (close requested by 'JoshRosen') Closes #7047 (close requested by 'srowen') Closes #8713 (close requested by 'marmbrus') Closes #5834 (close requested by 'srowen') Closes #7467 (close requested by 'tdas') Closes #8943 (close requested by 'xiaowen147') Closes #4434 (close requested by 'JoshRosen') Closes #8949 (close requested by 'srowen') Closes #5368 (close requested by 'JoshRosen') Closes #8186 (close requested by 'marmbrus') Closes #5147 (close requested by 'JoshRosen') From a337c235a12d4ea6a7d6db457acc6b32f1915241 Mon Sep 17 00:00:00 2001 From: Mahmoud Lababidi Date: Sun, 18 Oct 2015 11:39:19 -0700 Subject: [PATCH 094/139] [SPARK-11158][SQL] Modified _verify_type() to be more informative on Errors by presenting the Object The _verify_type() function had Errors that were raised when there were Type conversion issues but left out the Object in question. The Object is now added in the Error to reduce the strain on the user to debug through to figure out the Object that failed the Type conversion. The use case for me was a Pandas DataFrame that contained 'nan' as values for columns of Strings. Author: Mahmoud Lababidi Author: Mahmoud Lababidi Closes #9149 from lababidi/master. 
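The motivation is easiest to see with a concrete failure: a float NaN arriving where a string is expected. A small self-contained sketch of the error-reporting pattern the diff below adopts (an illustration only, not PySpark's actual `_verify_type`; the type table here is invented for the example):

```
def verify_type(obj, expected_types, type_name):
    # Include the offending object itself in the message, not just its type,
    # so the failing value can be spotted without extra debugging.
    if obj is not None and type(obj) not in expected_types:
        raise TypeError("%s can not accept object %r in type %s"
                        % (type_name, obj, type(obj)))

verify_type("ok", (str,), "StringType")  # passes silently
try:
    verify_type(float("nan"), (str,), "StringType")
except TypeError as e:
    print(e)  # e.g. StringType can not accept object nan in type <class 'float'>
```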
--- python/pyspark/sql/types.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/python/pyspark/sql/types.py b/python/pyspark/sql/types.py index 1f86894855cbe..5bc0773fa8660 100644 --- a/python/pyspark/sql/types.py +++ b/python/pyspark/sql/types.py @@ -1127,15 +1127,15 @@ def _verify_type(obj, dataType): return _type = type(dataType) - assert _type in _acceptable_types, "unknown datatype: %s" % dataType + assert _type in _acceptable_types, "unknown datatype: %s for object %r" % (dataType, obj) if _type is StructType: if not isinstance(obj, (tuple, list)): - raise TypeError("StructType can not accept object in type %s" % type(obj)) + raise TypeError("StructType can not accept object %r in type %s" % (obj, type(obj))) else: # subclass of them can not be fromInternald in JVM if type(obj) not in _acceptable_types[_type]: - raise TypeError("%s can not accept object in type %s" % (dataType, type(obj))) + raise TypeError("%s can not accept object %r in type %s" % (dataType, obj, type(obj))) if isinstance(dataType, ArrayType): for i in obj: From 94c8fef296e5cdac9a93ed34acc079e51839caa7 Mon Sep 17 00:00:00 2001 From: zsxwing Date: Sun, 18 Oct 2015 13:51:45 -0700 Subject: [PATCH 095/139] [SPARK-11126][SQL] Fix a memory leak in SQLListener._stageIdToStageMetrics SQLListener adds all stage infos to `_stageIdToStageMetrics`, but only removes stage infos belonging to SQL executions. This PR fixed it by ignoring stages that don't belong to SQL executions. Reported by Terry Hoo in https://www.mail-archive.com/userspark.apache.org/msg38810.html Author: zsxwing Closes #9132 from zsxwing/SPARK-11126. --- .../spark/sql/execution/ui/SQLListener.scala | 8 +++++++- .../sql/execution/ui/SQLListenerSuite.scala | 18 ++++++++++++++++-- 2 files changed, 23 insertions(+), 3 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/SQLListener.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/SQLListener.scala index b302b519998ac..5a072de400b6a 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/SQLListener.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/SQLListener.scala @@ -126,7 +126,13 @@ private[sql] class SQLListener(conf: SparkConf) extends SparkListener with Loggi val stageId = stageSubmitted.stageInfo.stageId val stageAttemptId = stageSubmitted.stageInfo.attemptId // Always override metrics for old stage attempt - _stageIdToStageMetrics(stageId) = new SQLStageMetrics(stageAttemptId) + if (_stageIdToStageMetrics.contains(stageId)) { + _stageIdToStageMetrics(stageId) = new SQLStageMetrics(stageAttemptId) + } else { + // If a stage belongs to some SQL execution, its stageId will be put in "onJobStart". + // Since "_stageIdToStageMetrics" doesn't contain it, it must not belong to any SQL execution. + // So we can ignore it. Otherwise, this may lead to memory leaks (SPARK-11126). 
+ } } override def onTaskEnd(taskEnd: SparkListenerTaskEnd): Unit = synchronized { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/ui/SQLListenerSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/ui/SQLListenerSuite.scala index cc1c1e10e98c4..03bcee94a2b91 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/ui/SQLListenerSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/ui/SQLListenerSuite.scala @@ -313,7 +313,22 @@ class SQLListenerSuite extends SparkFunSuite with SharedSQLContext { assert(executionUIData.failedJobs === Seq(0)) } - ignore("no memory leak") { + test("SPARK-11126: no memory leak when running non SQL jobs") { + val previousStageNumber = sqlContext.listener.stageIdToStageMetrics.size + sqlContext.sparkContext.parallelize(1 to 10).foreach(i => ()) + // listener should ignore the non SQL stage + assert(sqlContext.listener.stageIdToStageMetrics.size == previousStageNumber) + + sqlContext.sparkContext.parallelize(1 to 10).toDF().foreach(i => ()) + // listener should save the SQL stage + assert(sqlContext.listener.stageIdToStageMetrics.size == previousStageNumber + 1) + } + +} + +class SQLListenerMemoryLeakSuite extends SparkFunSuite { + + test("no memory leak") { val conf = new SparkConf() .setMaster("local") .setAppName("test") @@ -348,5 +363,4 @@ class SQLListenerSuite extends SparkFunSuite with SharedSQLContext { sc.stop() } } - } From d3180c25d8cf0899a7238e7d24b35c5ae918cc1d Mon Sep 17 00:00:00 2001 From: Brennon York Date: Sun, 18 Oct 2015 22:45:14 -0700 Subject: [PATCH 096/139] [SPARK-7018][BUILD] Refactor dev/run-tests-jenkins into Python This commit refactors the `run-tests-jenkins` script into Python. This refactoring was done by brennonyork in #7401; this PR contains a few minor edits from joshrosen in order to bring it up to date with other recent changes. From the original PR description (by brennonyork): Currently a few things are left out that, could and I think should, be smaller JIRA's after this. 1. There are still a few areas where we use environment variables where we don't need to (like `CURRENT_BLOCK`). I might get around to fixing this one in lieu of everything else, but wanted to point that out. 2. The PR tests are still written in bash. I opted to not change those and just rewrite the runner into Python. This is a great follow-on JIRA IMO. 3. All of the linting scripts are still in bash as well and would likely do to just add those in as follow-on JIRA's as well. Closes #7401. Author: Brennon York Closes #9161 from JoshRosen/run-tests-jenkins-refactoring. 
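Most of the new behaviour lives in `dev/run-tests-jenkins.py` below; the one shell-helper change worth calling out is that `run_cmd` in `dev/sparktestsupport/shellutils.py` gains a `return_output` flag. A minimal usage sketch, assuming `dev/` is on `PYTHONPATH` (the git commands are just placeholders for whatever the caller needs to run):

```
from sparktestsupport.shellutils import run_cmd

# Capture stdout for the caller, as the Jenkins script does to record HEAD.
head = run_cmd(['git', 'rev-parse', 'HEAD'], return_output=True).strip()

# Default mode: run the command directly and, on a non-zero exit status,
# print an error and terminate with the current CURRENT_BLOCK error code.
run_cmd(['git', 'checkout', '-f', head])
```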
--- dev/lint-python | 2 +- dev/run-tests-codes.sh | 30 ---- dev/run-tests-jenkins | 204 +------------------------- dev/run-tests-jenkins.py | 228 +++++++++++++++++++++++++++++ dev/run-tests.py | 20 +-- dev/sparktestsupport/__init__.py | 14 ++ dev/sparktestsupport/shellutils.py | 37 ++++- python/run-tests.py | 19 +-- 8 files changed, 285 insertions(+), 269 deletions(-) delete mode 100644 dev/run-tests-codes.sh create mode 100755 dev/run-tests-jenkins.py diff --git a/dev/lint-python b/dev/lint-python index 575dbb0ae321b..0b97213ae3dff 100755 --- a/dev/lint-python +++ b/dev/lint-python @@ -20,7 +20,7 @@ SCRIPT_DIR="$( cd "$( dirname "$0" )" && pwd )" SPARK_ROOT_DIR="$(dirname "$SCRIPT_DIR")" PATHS_TO_CHECK="./python/pyspark/ ./ec2/spark_ec2.py ./examples/src/main/python/ ./dev/sparktestsupport" -PATHS_TO_CHECK="$PATHS_TO_CHECK ./dev/run-tests.py ./python/run-tests.py" +PATHS_TO_CHECK="$PATHS_TO_CHECK ./dev/run-tests.py ./python/run-tests.py ./dev/run-tests-jenkins.py" PEP8_REPORT_PATH="$SPARK_ROOT_DIR/dev/pep8-report.txt" PYLINT_REPORT_PATH="$SPARK_ROOT_DIR/dev/pylint-report.txt" PYLINT_INSTALL_INFO="$SPARK_ROOT_DIR/dev/pylint-info.txt" diff --git a/dev/run-tests-codes.sh b/dev/run-tests-codes.sh deleted file mode 100644 index 1f16790522e76..0000000000000 --- a/dev/run-tests-codes.sh +++ /dev/null @@ -1,30 +0,0 @@ -#!/usr/bin/env bash - -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -readonly BLOCK_GENERAL=10 -readonly BLOCK_RAT=11 -readonly BLOCK_SCALA_STYLE=12 -readonly BLOCK_PYTHON_STYLE=13 -readonly BLOCK_R_STYLE=14 -readonly BLOCK_DOCUMENTATION=15 -readonly BLOCK_BUILD=16 -readonly BLOCK_MIMA=17 -readonly BLOCK_SPARK_UNIT_TESTS=18 -readonly BLOCK_PYSPARK_UNIT_TESTS=19 -readonly BLOCK_SPARKR_UNIT_TESTS=20 diff --git a/dev/run-tests-jenkins b/dev/run-tests-jenkins index d3b05fa6df0ce..e79accf9e987a 100755 --- a/dev/run-tests-jenkins +++ b/dev/run-tests-jenkins @@ -22,207 +22,7 @@ # Environment variables are populated by the code here: #+ https://github.com/jenkinsci/ghprb-plugin/blob/master/src/main/java/org/jenkinsci/plugins/ghprb/GhprbTrigger.java#L139 -# Go to the Spark project root directory -FWDIR="$(cd `dirname $0`/..; pwd)" +FWDIR="$(cd "`dirname $0`"/..; pwd)" cd "$FWDIR" -source "$FWDIR/dev/run-tests-codes.sh" - -COMMENTS_URL="https://api.github.com/repos/apache/spark/issues/$ghprbPullId/comments" -PULL_REQUEST_URL="https://github.com/apache/spark/pull/$ghprbPullId" - -# Important Environment Variables -# --- -# $ghprbActualCommit -#+ This is the hash of the most recent commit in the PR. -#+ The merge-base of this and master is the commit from which the PR was branched. -# $sha1 -#+ If the patch merges cleanly, this is a reference to the merge commit hash -#+ (e.g. "origin/pr/2606/merge"). 
-#+ If the patch does not merge cleanly, it is equal to $ghprbActualCommit. -#+ The merge-base of this and master in the case of a clean merge is the most recent commit -#+ against master. - -COMMIT_URL="https://github.com/apache/spark/commit/${ghprbActualCommit}" -# GitHub doesn't auto-link short hashes when submitted via the API, unfortunately. :( -SHORT_COMMIT_HASH="${ghprbActualCommit:0:7}" - -# format: http://linux.die.net/man/1/timeout -# must be less than the timeout configured on Jenkins (currently 300m) -TESTS_TIMEOUT="250m" - -# Array to capture all tests to run on the pull request. These tests are held under the -#+ dev/tests/ directory. -# -# To write a PR test: -#+ * the file must reside within the dev/tests directory -#+ * be an executable bash script -#+ * accept three arguments on the command line, the first being the Github PR long commit -#+ hash, the second the Github SHA1 hash, and the final the current PR hash -#+ * and, lastly, return string output to be included in the pr message output that will -#+ be posted to Github -PR_TESTS=( - "pr_merge_ability" - "pr_public_classes" -# DISABLED (pwendell) "pr_new_dependencies" -) - -function post_message () { - local message=$1 - local data="{\"body\": \"$message\"}" - local HTTP_CODE_HEADER="HTTP Response Code: " - - echo "Attempting to post to Github..." - - local curl_output=$( - curl `#--dump-header -` \ - --silent \ - --user x-oauth-basic:$GITHUB_OAUTH_KEY \ - --request POST \ - --data "$data" \ - --write-out "${HTTP_CODE_HEADER}%{http_code}\n" \ - --header "Content-Type: application/json" \ - "$COMMENTS_URL" #> /dev/null #| "$FWDIR/dev/jq" .id #| head -n 8 - ) - local curl_status=${PIPESTATUS[0]} - - if [ "$curl_status" -ne 0 ]; then - echo "Failed to post message to GitHub." >&2 - echo " > curl_status: ${curl_status}" >&2 - echo " > curl_output: ${curl_output}" >&2 - echo " > data: ${data}" >&2 - # exit $curl_status - fi - - local api_response=$( - echo "${curl_output}" \ - | grep -v -e "^${HTTP_CODE_HEADER}" - ) - - local http_code=$( - echo "${curl_output}" \ - | grep -e "^${HTTP_CODE_HEADER}" \ - | sed -r -e "s/^${HTTP_CODE_HEADER}//g" - ) - - if [ -n "$http_code" ] && [ "$http_code" -ne "201" ]; then - echo " > http_code: ${http_code}." >&2 - echo " > api_response: ${api_response}" >&2 - echo " > data: ${data}" >&2 - fi - - if [ "$curl_status" -eq 0 ] && [ "$http_code" -eq "201" ]; then - echo " > Post successful." - fi -} - -# post start message -{ - start_message="\ - [Test build ${BUILD_DISPLAY_NAME} has started](${BUILD_URL}consoleFull) for \ - PR $ghprbPullId at commit [\`${SHORT_COMMIT_HASH}\`](${COMMIT_URL})." 
- - post_message "$start_message" -} - -# Environment variable to capture PR test output -pr_message="" -# Ensure we save off the current HEAD to revert to -current_pr_head="`git rev-parse HEAD`" - -echo "HEAD: `git rev-parse HEAD`" -echo "\$ghprbActualCommit: $ghprbActualCommit" -echo "\$sha1: $sha1" -echo "\$ghprbPullTitle: $ghprbPullTitle" - -# Run pull request tests -for t in "${PR_TESTS[@]}"; do - this_test="${FWDIR}/dev/tests/${t}.sh" - # Ensure the test can be found and is a file - if [ -f "${this_test}" ]; then - echo "Running test: $t" - this_mssg="$(bash "${this_test}" "${ghprbActualCommit}" "${sha1}" "${current_pr_head}")" - # Check if this is the merge test as we submit that note *before* and *after* - # the tests run - [ "$t" == "pr_merge_ability" ] && merge_note="${this_mssg}" - pr_message="${pr_message}\n${this_mssg}" - # Ensure, after each test, that we're back on the current PR - git checkout -f "${current_pr_head}" &>/dev/null - else - echo "Cannot find test ${this_test}." - fi -done - -# run tests -{ - # Marks this build is a pull request build. - export AMP_JENKINS_PRB=true - if [[ $ghprbPullTitle == *"test-maven"* ]]; then - export AMPLAB_JENKINS_BUILD_TOOL="maven" - fi - if [[ $ghprbPullTitle == *"test-hadoop1.0"* ]]; then - export AMPLAB_JENKINS_BUILD_PROFILE="hadoop1.0" - elif [[ $ghprbPullTitle == *"test-hadoop2.0"* ]]; then - export AMPLAB_JENKINS_BUILD_PROFILE="hadoop2.0" - elif [[ $ghprbPullTitle == *"test-hadoop2.2"* ]]; then - export AMPLAB_JENKINS_BUILD_PROFILE="hadoop2.2" - elif [[ $ghprbPullTitle == *"test-hadoop2.3"* ]]; then - export AMPLAB_JENKINS_BUILD_PROFILE="hadoop2.3" - fi - - timeout "${TESTS_TIMEOUT}" ./dev/run-tests - test_result="$?" - - if [ "$test_result" -eq "124" ]; then - fail_message="**[Test build ${BUILD_DISPLAY_NAME} timed out](${BUILD_URL}console)** \ - for PR $ghprbPullId at commit [\`${SHORT_COMMIT_HASH}\`](${COMMIT_URL}) \ - after a configured wait of \`${TESTS_TIMEOUT}\`." - - post_message "$fail_message" - exit $test_result - elif [ "$test_result" -eq "0" ]; then - test_result_note=" * This patch **passes all tests**." - else - if [ "$test_result" -eq "$BLOCK_GENERAL" ]; then - failing_test="some tests" - elif [ "$test_result" -eq "$BLOCK_RAT" ]; then - failing_test="RAT tests" - elif [ "$test_result" -eq "$BLOCK_SCALA_STYLE" ]; then - failing_test="Scala style tests" - elif [ "$test_result" -eq "$BLOCK_PYTHON_STYLE" ]; then - failing_test="Python style tests" - elif [ "$test_result" -eq "$BLOCK_R_STYLE" ]; then - failing_test="R style tests" - elif [ "$test_result" -eq "$BLOCK_DOCUMENTATION" ]; then - failing_test="to generate documentation" - elif [ "$test_result" -eq "$BLOCK_BUILD" ]; then - failing_test="to build" - elif [ "$test_result" -eq "$BLOCK_MIMA" ]; then - failing_test="MiMa tests" - elif [ "$test_result" -eq "$BLOCK_SPARK_UNIT_TESTS" ]; then - failing_test="Spark unit tests" - elif [ "$test_result" -eq "$BLOCK_PYSPARK_UNIT_TESTS" ]; then - failing_test="PySpark unit tests" - elif [ "$test_result" -eq "$BLOCK_SPARKR_UNIT_TESTS" ]; then - failing_test="SparkR unit tests" - else - failing_test="some tests" - fi - - test_result_note=" * This patch **fails $failing_test**." - fi -} - -# post end message -{ - result_message="\ - [Test build ${BUILD_DISPLAY_NAME} has finished](${BUILD_URL}console) for \ - PR $ghprbPullId at commit [\`${SHORT_COMMIT_HASH}\`](${COMMIT_URL})." 
- - result_message="${result_message}\n${test_result_note}" - result_message="${result_message}${pr_message}" - - post_message "$result_message" -} - -exit $test_result +exec python -u ./dev/run-tests-jenkins.py "$@" diff --git a/dev/run-tests-jenkins.py b/dev/run-tests-jenkins.py new file mode 100755 index 0000000000000..623004310e189 --- /dev/null +++ b/dev/run-tests-jenkins.py @@ -0,0 +1,228 @@ +#!/usr/bin/env python2 + +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from __future__ import print_function +import os +import sys +import json +import urllib2 +import functools +import subprocess + +from sparktestsupport import SPARK_HOME, ERROR_CODES +from sparktestsupport.shellutils import run_cmd + + +def print_err(msg): + """ + Given a set of arguments, will print them to the STDERR stream + """ + print(msg, file=sys.stderr) + + +def post_message_to_github(msg, ghprb_pull_id): + print("Attempting to post to Github...") + + url = "https://api.github.com/repos/apache/spark/issues/" + ghprb_pull_id + "/comments" + github_oauth_key = os.environ["GITHUB_OAUTH_KEY"] + + posted_message = json.dumps({"body": msg}) + request = urllib2.Request(url, + headers={ + "Authorization": "token %s" % github_oauth_key, + "Content-Type": "application/json" + }, + data=posted_message) + try: + response = urllib2.urlopen(request) + + if response.getcode() == 201: + print(" > Post successful.") + except urllib2.HTTPError as http_e: + print_err("Failed to post message to Github.") + print_err(" > http_code: %s" % http_e.code) + print_err(" > api_response: %s" % http_e.read()) + print_err(" > data: %s" % posted_message) + except urllib2.URLError as url_e: + print_err("Failed to post message to Github.") + print_err(" > urllib2_status: %s" % url_e.reason[1]) + print_err(" > data: %s" % posted_message) + + +def pr_message(build_display_name, + build_url, + ghprb_pull_id, + short_commit_hash, + commit_url, + msg, + post_msg=''): + # align the arguments properly for string formatting + str_args = (build_display_name, + msg, + build_url, + ghprb_pull_id, + short_commit_hash, + commit_url, + str(' ' + post_msg + '.') if post_msg else '.') + return '**[Test build %s %s](%sconsoleFull)** for PR %s at commit [`%s`](%s)%s' % str_args + + +def run_pr_checks(pr_tests, ghprb_actual_commit, sha1): + """ + Executes a set of pull request checks to ease development and report issues with various + components such as style, linting, dependencies, compatibilities, etc. 
+ @return a list of messages to post back to Github + """ + # Ensure we save off the current HEAD to revert to + current_pr_head = run_cmd(['git', 'rev-parse', 'HEAD'], return_output=True).strip() + pr_results = list() + + for pr_test in pr_tests: + test_name = pr_test + '.sh' + pr_results.append(run_cmd(['bash', os.path.join(SPARK_HOME, 'dev', 'tests', test_name), + ghprb_actual_commit, sha1], + return_output=True).rstrip()) + # Ensure, after each test, that we're back on the current PR + run_cmd(['git', 'checkout', '-f', current_pr_head]) + return pr_results + + +def run_tests(tests_timeout): + """ + Runs the `dev/run-tests` script and responds with the correct error message + under the various failure scenarios. + @return a tuple containing the test result code and the result note to post to Github + """ + + test_result_code = subprocess.Popen(['timeout', + tests_timeout, + os.path.join(SPARK_HOME, 'dev', 'run-tests')]).wait() + + failure_note_by_errcode = { + 1: 'executing the `dev/run-tests` script', # error to denote run-tests script failures + ERROR_CODES["BLOCK_GENERAL"]: 'some tests', + ERROR_CODES["BLOCK_RAT"]: 'RAT tests', + ERROR_CODES["BLOCK_SCALA_STYLE"]: 'Scala style tests', + ERROR_CODES["BLOCK_PYTHON_STYLE"]: 'Python style tests', + ERROR_CODES["BLOCK_R_STYLE"]: 'R style tests', + ERROR_CODES["BLOCK_DOCUMENTATION"]: 'to generate documentation', + ERROR_CODES["BLOCK_BUILD"]: 'to build', + ERROR_CODES["BLOCK_MIMA"]: 'MiMa tests', + ERROR_CODES["BLOCK_SPARK_UNIT_TESTS"]: 'Spark unit tests', + ERROR_CODES["BLOCK_PYSPARK_UNIT_TESTS"]: 'PySpark unit tests', + ERROR_CODES["BLOCK_SPARKR_UNIT_TESTS"]: 'SparkR unit tests', + ERROR_CODES["BLOCK_TIMEOUT"]: 'from timeout after a configured wait of \`%s\`' % ( + tests_timeout) + } + + if test_result_code == 0: + test_result_note = ' * This patch passes all tests.' + else: + test_result_note = ' * This patch **fails %s**.' % failure_note_by_errcode[test_result_code] + + return [test_result_code, test_result_note] + + +def main(): + # Important Environment Variables + # --- + # $ghprbActualCommit + # This is the hash of the most recent commit in the PR. + # The merge-base of this and master is the commit from which the PR was branched. + # $sha1 + # If the patch merges cleanly, this is a reference to the merge commit hash + # (e.g. "origin/pr/2606/merge"). + # If the patch does not merge cleanly, it is equal to $ghprbActualCommit. + # The merge-base of this and master in the case of a clean merge is the most recent commit + # against master. + ghprb_pull_id = os.environ["ghprbPullId"] + ghprb_actual_commit = os.environ["ghprbActualCommit"] + ghprb_pull_title = os.environ["ghprbPullTitle"] + sha1 = os.environ["sha1"] + + # Marks this build as a pull request build. 
+ os.environ["AMP_JENKINS_PRB"] = "true" + # Switch to a Maven-based build if the PR title contains "test-maven": + if "test-maven" in ghprb_pull_title: + os.environ["AMPLAB_JENKINS_BUILD_TOOL"] = "maven" + # Switch the Hadoop profile based on the PR title: + if "test-hadoop1.0" in ghprb_pull_title: + os.environ["AMPLAB_JENKINS_BUILD_PROFILE"] = "hadoop1.0" + if "test-hadoop2.2" in ghprb_pull_title: + os.environ["AMPLAB_JENKINS_BUILD_PROFILE"] = "hadoop2.0" + if "test-hadoop2.2" in ghprb_pull_title: + os.environ["AMPLAB_JENKINS_BUILD_PROFILE"] = "hadoop2.2" + if "test-hadoop2.3" in ghprb_pull_title: + os.environ["AMPLAB_JENKINS_BUILD_PROFILE"] = "hadoop2.3" + + build_display_name = os.environ["BUILD_DISPLAY_NAME"] + build_url = os.environ["BUILD_URL"] + + commit_url = "https://github.com/apache/spark/commit/" + ghprb_actual_commit + + # GitHub doesn't auto-link short hashes when submitted via the API, unfortunately. :( + short_commit_hash = ghprb_actual_commit[0:7] + + # format: http://linux.die.net/man/1/timeout + # must be less than the timeout configured on Jenkins (currently 300m) + tests_timeout = "250m" + + # Array to capture all test names to run on the pull request. These tests are represented + # by their file equivalents in the dev/tests/ directory. + # + # To write a PR test: + # * the file must reside within the dev/tests directory + # * be an executable bash script + # * accept three arguments on the command line, the first being the Github PR long commit + # hash, the second the Github SHA1 hash, and the final the current PR hash + # * and, lastly, return string output to be included in the pr message output that will + # be posted to Github + pr_tests = [ + "pr_merge_ability", + "pr_public_classes" + # DISABLED (pwendell) "pr_new_dependencies" + ] + + # `bind_message_base` returns a function to generate messages for Github posting + github_message = functools.partial(pr_message, + build_display_name, + build_url, + ghprb_pull_id, + short_commit_hash, + commit_url) + + # post start message + post_message_to_github(github_message('has started'), ghprb_pull_id) + + pr_check_results = run_pr_checks(pr_tests, ghprb_actual_commit, sha1) + + test_result_code, test_result_note = run_tests(tests_timeout) + + # post end message + result_message = github_message('has finished') + result_message += '\n' + test_result_note + '\n' + result_message += '\n'.join(pr_check_results) + + post_message_to_github(result_message, ghprb_pull_id) + + sys.exit(test_result_code) + + +if __name__ == "__main__": + main() diff --git a/dev/run-tests.py b/dev/run-tests.py index d4d6880491bc8..6b4b71073453d 100755 --- a/dev/run-tests.py +++ b/dev/run-tests.py @@ -27,10 +27,11 @@ import subprocess from collections import namedtuple -from sparktestsupport import SPARK_HOME, USER_HOME +from sparktestsupport import SPARK_HOME, USER_HOME, ERROR_CODES from sparktestsupport.shellutils import exit_from_command_with_retcode, run_cmd, rm_r, which import sparktestsupport.modules as modules + # ------------------------------------------------------------------------------------------------- # Functions for traversing module dependency graph # ------------------------------------------------------------------------------------------------- @@ -130,19 +131,6 @@ def determine_tags_to_exclude(changed_modules): # Functions for working with subprocesses and shell tools # ------------------------------------------------------------------------------------------------- -def get_error_codes(err_code_file): - """Function to 
retrieve all block numbers from the `run-tests-codes.sh`
-    file to maintain backwards compatibility with the `run-tests-jenkins`
-    script"""
-
-    with open(err_code_file, 'r') as f:
-        err_codes = [e.split()[1].strip().split('=')
-                     for e in f if e.startswith("readonly")]
-    return dict(err_codes)
-
-
-ERROR_CODES = get_error_codes(os.path.join(SPARK_HOME, "dev/run-tests-codes.sh"))
-
 
 def determine_java_executable():
     """Will return the path of the java executable that will be used by Spark's
@@ -191,7 +179,7 @@ def determine_java_version(java_exe):
 
 
 def set_title_and_block(title, err_block):
-    os.environ["CURRENT_BLOCK"] = ERROR_CODES[err_block]
+    os.environ["CURRENT_BLOCK"] = str(ERROR_CODES[err_block])
     line_str = '=' * 72
 
     print('')
@@ -467,7 +455,7 @@ def main():
         rm_r(os.path.join(USER_HOME, ".ivy2", "local", "org.apache.spark"))
         rm_r(os.path.join(USER_HOME, ".ivy2", "cache", "org.apache.spark"))
 
-    os.environ["CURRENT_BLOCK"] = ERROR_CODES["BLOCK_GENERAL"]
+    os.environ["CURRENT_BLOCK"] = str(ERROR_CODES["BLOCK_GENERAL"])
 
     java_exe = determine_java_executable()
diff --git a/dev/sparktestsupport/__init__.py b/dev/sparktestsupport/__init__.py
index 12696d98fb988..8ab6d9e37ca2f 100644
--- a/dev/sparktestsupport/__init__.py
+++ b/dev/sparktestsupport/__init__.py
@@ -19,3 +19,17 @@
 
 SPARK_HOME = os.path.abspath(os.path.join(os.path.dirname(os.path.realpath(__file__)), "../../"))
 USER_HOME = os.environ.get("HOME")
+ERROR_CODES = {
+    "BLOCK_GENERAL": 10,
+    "BLOCK_RAT": 11,
+    "BLOCK_SCALA_STYLE": 12,
+    "BLOCK_PYTHON_STYLE": 13,
+    "BLOCK_R_STYLE": 14,
+    "BLOCK_DOCUMENTATION": 15,
+    "BLOCK_BUILD": 16,
+    "BLOCK_MIMA": 17,
+    "BLOCK_SPARK_UNIT_TESTS": 18,
+    "BLOCK_PYSPARK_UNIT_TESTS": 19,
+    "BLOCK_SPARKR_UNIT_TESTS": 20,
+    "BLOCK_TIMEOUT": 124
+}
diff --git a/dev/sparktestsupport/shellutils.py b/dev/sparktestsupport/shellutils.py
index 12bd0bf3a4fe9..d280e797077d1 100644
--- a/dev/sparktestsupport/shellutils.py
+++ b/dev/sparktestsupport/shellutils.py
@@ -22,6 +22,36 @@
 import sys
 
 
+if sys.version_info >= (2, 7):
+    subprocess_check_output = subprocess.check_output
+    subprocess_check_call = subprocess.check_call
+else:
+    # SPARK-8763
+    # backported from subprocess module in Python 2.7
+    def subprocess_check_output(*popenargs, **kwargs):
+        if 'stdout' in kwargs:
+            raise ValueError('stdout argument not allowed, it will be overridden.')
+        process = subprocess.Popen(stdout=subprocess.PIPE, *popenargs, **kwargs)
+        output, unused_err = process.communicate()
+        retcode = process.poll()
+        if retcode:
+            cmd = kwargs.get("args")
+            if cmd is None:
+                cmd = popenargs[0]
+            raise subprocess.CalledProcessError(retcode, cmd, output=output)
+        return output
+
+    # backported from subprocess module in Python 2.7
+    def subprocess_check_call(*popenargs, **kwargs):
+        retcode = subprocess.call(*popenargs, **kwargs)
+        if retcode:
+            cmd = kwargs.get("args")
+            if cmd is None:
+                cmd = popenargs[0]
+            raise subprocess.CalledProcessError(retcode, cmd)
+        return 0
+
+
 def exit_from_command_with_retcode(cmd, retcode):
     print("[error] running", ' '.join(cmd), "; received return code", retcode)
     sys.exit(int(os.environ.get("CURRENT_BLOCK", 255)))
@@ -39,7 +69,7 @@ def rm_r(path):
         os.remove(path)
 
 
-def run_cmd(cmd):
+def run_cmd(cmd, return_output=False):
     """
     Given a command as a list of arguments will attempt to execute the command
     and, on failure, print an error message and exit.
@@ -48,7 +78,10 @@ def run_cmd(cmd): if not isinstance(cmd, list): cmd = cmd.split() try: - subprocess.check_call(cmd) + if return_output: + return subprocess_check_output(cmd) + else: + return subprocess_check_call(cmd) except subprocess.CalledProcessError as e: exit_from_command_with_retcode(e.cmd, e.returncode) diff --git a/python/run-tests.py b/python/run-tests.py index 152f5cc98d0fd..f5857f8c62214 100755 --- a/python/run-tests.py +++ b/python/run-tests.py @@ -31,23 +31,6 @@ import Queue else: import queue as Queue -if sys.version_info >= (2, 7): - subprocess_check_output = subprocess.check_output -else: - # SPARK-8763 - # backported from subprocess module in Python 2.7 - def subprocess_check_output(*popenargs, **kwargs): - if 'stdout' in kwargs: - raise ValueError('stdout argument not allowed, it will be overridden.') - process = subprocess.Popen(stdout=subprocess.PIPE, *popenargs, **kwargs) - output, unused_err = process.communicate() - retcode = process.poll() - if retcode: - cmd = kwargs.get("args") - if cmd is None: - cmd = popenargs[0] - raise subprocess.CalledProcessError(retcode, cmd, output=output) - return output # Append `SPARK_HOME/dev` to the Python path so that we can import the sparktestsupport module @@ -55,7 +38,7 @@ def subprocess_check_output(*popenargs, **kwargs): from sparktestsupport import SPARK_HOME # noqa (suppress pep8 warnings) -from sparktestsupport.shellutils import which # noqa +from sparktestsupport.shellutils import which, subprocess_check_output # noqa from sparktestsupport.modules import all_modules # noqa From beb8bc1ea588b7f9ab7effff707c0f784421364d Mon Sep 17 00:00:00 2001 From: zsxwing Date: Mon, 19 Oct 2015 00:06:51 -0700 Subject: [PATCH 097/139] [SPARK-11126][SQL] Fix the potential flaky test The unit test added in #9132 is flaky. This is a follow up PR to add `listenerBus.waitUntilEmpty` to fix it. Author: zsxwing Closes #9163 from zsxwing/SPARK-11126-follow-up. 
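In other words, the fix is simply to drain the listener bus before asserting on the
listener's state. A minimal sketch of the pattern, adapted from the test change below
(listenerBus is private[spark], so this only compiles inside Spark's own test sources):

    // Run a non-SQL job, then wait for all pending SparkListener events to be delivered
    // before inspecting the SQL listener; without the wait, the assertion can race the
    // asynchronous event posting and fail intermittently.
    sqlContext.sparkContext.parallelize(1 to 10).foreach(i => ())
    sqlContext.sparkContext.listenerBus.waitUntilEmpty(10000)  // timeout in milliseconds
    assert(sqlContext.listener.stageIdToStageMetrics.size == previousStageNumber)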
--- .../org/apache/spark/sql/execution/ui/SQLListenerSuite.scala | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/ui/SQLListenerSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/ui/SQLListenerSuite.scala index 03bcee94a2b91..c15aac775096c 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/ui/SQLListenerSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/ui/SQLListenerSuite.scala @@ -316,10 +316,12 @@ class SQLListenerSuite extends SparkFunSuite with SharedSQLContext { test("SPARK-11126: no memory leak when running non SQL jobs") { val previousStageNumber = sqlContext.listener.stageIdToStageMetrics.size sqlContext.sparkContext.parallelize(1 to 10).foreach(i => ()) + sqlContext.sparkContext.listenerBus.waitUntilEmpty(10000) // listener should ignore the non SQL stage assert(sqlContext.listener.stageIdToStageMetrics.size == previousStageNumber) sqlContext.sparkContext.parallelize(1 to 10).toDF().foreach(i => ()) + sqlContext.sparkContext.listenerBus.waitUntilEmpty(10000) // listener should save the SQL stage assert(sqlContext.listener.stageIdToStageMetrics.size == previousStageNumber + 1) } From bd64c2d550c36405f9be25a5c6a8eaa54bf4e7e7 Mon Sep 17 00:00:00 2001 From: Jacek Laskowski Date: Mon, 19 Oct 2015 09:59:18 +0100 Subject: [PATCH 098/139] =?UTF-8?q?[SPARK-10921][YARN]=20Completely=20remo?= =?UTF-8?q?ve=20the=20use=20of=20SparkContext.prefer=E2=80=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit …redNodeLocationData Author: Jacek Laskowski Closes #8976 from jaceklaskowski/SPARK-10921. --- .../scala/org/apache/spark/SparkContext.scala | 22 +++++-------------- project/MimaExcludes.scala | 3 +++ .../spark/deploy/yarn/ApplicationMaster.scala | 1 - .../spark/deploy/yarn/YarnRMClient.scala | 2 -- 4 files changed, 9 insertions(+), 19 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/SparkContext.scala b/core/src/main/scala/org/apache/spark/SparkContext.scala index 0c72adfb9505b..ccba3ed9e643c 100644 --- a/core/src/main/scala/org/apache/spark/SparkContext.scala +++ b/core/src/main/scala/org/apache/spark/SparkContext.scala @@ -90,11 +90,6 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli // NOTE: this must be placed at the beginning of the SparkContext constructor. SparkContext.markPartiallyConstructed(this, allowMultipleContexts) - // This is used only by YARN for now, but should be relevant to other cluster types (Mesos, - // etc) too. This is typically generated from InputFormatInfo.computePreferredLocations. It - // contains a map from hostname to a list of input format splits on the host. - private[spark] var preferredNodeLocationData: Map[String, Set[SplitInfo]] = Map() - val startTime = System.currentTimeMillis() private[spark] val stopped: AtomicBoolean = new AtomicBoolean(false) @@ -116,16 +111,13 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli * Alternative constructor for setting preferred locations where Spark will create executors. * * @param config a [[org.apache.spark.SparkConf]] object specifying other Spark parameters - * @param preferredNodeLocationData used in YARN mode to select nodes to launch containers on. - * Can be generated using [[org.apache.spark.scheduler.InputFormatInfo.computePreferredLocations]] - * from a list of input files or InputFormats for the application. + * @param preferredNodeLocationData not used. 
Left for backward compatibility. */ @deprecated("Passing in preferred locations has no effect at all, see SPARK-8949", "1.5.0") @DeveloperApi def this(config: SparkConf, preferredNodeLocationData: Map[String, Set[SplitInfo]]) = { this(config) logWarning("Passing in preferred locations has no effect at all, see SPARK-8949") - this.preferredNodeLocationData = preferredNodeLocationData } /** @@ -147,10 +139,9 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli * @param jars Collection of JARs to send to the cluster. These can be paths on the local file * system or HDFS, HTTP, HTTPS, or FTP URLs. * @param environment Environment variables to set on worker nodes. - * @param preferredNodeLocationData used in YARN mode to select nodes to launch containers on. - * Can be generated using [[org.apache.spark.scheduler.InputFormatInfo.computePreferredLocations]] - * from a list of input files or InputFormats for the application. + * @param preferredNodeLocationData not used. Left for backward compatibility. */ + @deprecated("Passing in preferred locations has no effect at all, see SPARK-10921", "1.6.0") def this( master: String, appName: String, @@ -163,7 +154,6 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli if (preferredNodeLocationData.nonEmpty) { logWarning("Passing in preferred locations has no effect at all, see SPARK-8949") } - this.preferredNodeLocationData = preferredNodeLocationData } // NOTE: The below constructors could be consolidated using default arguments. Due to @@ -177,7 +167,7 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli * @param appName A name for your application, to display on the cluster web UI. */ private[spark] def this(master: String, appName: String) = - this(master, appName, null, Nil, Map(), Map()) + this(master, appName, null, Nil, Map()) /** * Alternative constructor that allows setting common Spark properties directly @@ -187,7 +177,7 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli * @param sparkHome Location where Spark is installed on cluster nodes. */ private[spark] def this(master: String, appName: String, sparkHome: String) = - this(master, appName, sparkHome, Nil, Map(), Map()) + this(master, appName, sparkHome, Nil, Map()) /** * Alternative constructor that allows setting common Spark properties directly @@ -199,7 +189,7 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli * system or HDFS, HTTP, HTTPS, or FTP URLs. 
*/ private[spark] def this(master: String, appName: String, sparkHome: String, jars: Seq[String]) = - this(master, appName, sparkHome, jars, Map(), Map()) + this(master, appName, sparkHome, jars, Map()) // log out Spark Version in Spark driver log logInfo(s"Running Spark version $SPARK_VERSION") diff --git a/project/MimaExcludes.scala b/project/MimaExcludes.scala index 08e4a449cf762..0872d3f3e7093 100644 --- a/project/MimaExcludes.scala +++ b/project/MimaExcludes.scala @@ -100,6 +100,9 @@ object MimaExcludes { "org.apache.spark.sql.SQLContext.setSession"), ProblemFilters.exclude[MissingMethodProblem]( "org.apache.spark.sql.SQLContext.createSession") + ) ++ Seq( + ProblemFilters.exclude[MissingMethodProblem]( + "org.apache.spark.SparkContext.preferredNodeLocationData_=") ) case v if v.startsWith("1.5") => Seq( diff --git a/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala b/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala index 3791eea5bf178..d1d248bf79beb 100644 --- a/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala +++ b/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala @@ -255,7 +255,6 @@ private[spark] class ApplicationMaster( driverRef, yarnConf, _sparkConf, - if (sc != null) sc.preferredNodeLocationData else Map(), uiAddress, historyAddress, securityMgr) diff --git a/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnRMClient.scala b/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnRMClient.scala index df042bf291de7..d2a211f6711ff 100644 --- a/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnRMClient.scala +++ b/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnRMClient.scala @@ -49,7 +49,6 @@ private[spark] class YarnRMClient(args: ApplicationMasterArguments) extends Logg * * @param conf The Yarn configuration. * @param sparkConf The Spark configuration. - * @param preferredNodeLocations Map with hints about where to allocate containers. * @param uiAddress Address of the SparkUI. * @param uiHistoryAddress Address of the application on the History Server. */ @@ -58,7 +57,6 @@ private[spark] class YarnRMClient(args: ApplicationMasterArguments) extends Logg driverRef: RpcEndpointRef, conf: YarnConfiguration, sparkConf: SparkConf, - preferredNodeLocations: Map[String, Set[SplitInfo]], uiAddress: String, uiHistoryAddress: String, securityMgr: SecurityManager From dfa41e63b98c28b087c56f94658b5e99e8a7758c Mon Sep 17 00:00:00 2001 From: Alex Angelini Date: Mon, 19 Oct 2015 10:07:39 -0700 Subject: [PATCH 099/139] [SPARK-9643] Upgrade pyrolite to 4.9 Includes: https://github.com/irmen/Pyrolite/pull/23 which fixes datetimes with timezones. JoshRosen https://issues.apache.org/jira/browse/SPARK-9643 Author: Alex Angelini Closes #7950 from angelini/upgrade_pyrolite_up. 
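For background, PySpark moves data between the JVM and Python workers through Pyrolite's
pickle implementation, which is where the datetime/timezone handling matters. A minimal,
hypothetical Scala sketch of such a round trip (not part of this patch; Pickler and
Unpickler are Pyrolite's public classes, and the Calendar value merely stands in for the
timezone-aware datetimes that the 4.9 release is expected to handle correctly):

    import java.util.{GregorianCalendar, TimeZone}
    import net.razorvine.pickle.{Pickler, Unpickler}

    object PyroliteRoundTrip {
      def main(args: Array[String]): Unit = {
        // Pickle a JVM calendar value carrying an explicit time zone ...
        val calendar = new GregorianCalendar(TimeZone.getTimeZone("America/New_York"))
        val bytes: Array[Byte] = new Pickler().dumps(calendar)
        // ... and unpickle the bytes back into a JVM object.
        val restored: AnyRef = new Unpickler().loads(bytes)
        println(s"round-tripped value: $restored")
      }
    }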
--- core/pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/pom.xml b/core/pom.xml index c0af98a04fb1d..fdcb6a7902bbd 100644 --- a/core/pom.xml +++ b/core/pom.xml @@ -339,7 +339,7 @@ net.razorvine pyrolite - 4.4 + 4.9 net.razorvine From 4c33a34ba3167ae67fdb4978ea2166ce65638fb9 Mon Sep 17 00:00:00 2001 From: lewuathe Date: Mon, 19 Oct 2015 10:46:10 -0700 Subject: [PATCH 100/139] =?UTF-8?q?[SPARK-10668]=20[ML]=20Use=20WeightedLe?= =?UTF-8?q?astSquares=20in=20LinearRegression=20with=20L=E2=80=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit …2 regularization if the number of features is small Author: lewuathe Author: Lewuathe Author: Kai Sasaki Author: Lewuathe Closes #8884 from Lewuathe/SPARK-10668. --- R/pkg/R/mllib.R | 5 +- R/pkg/inst/tests/test_mllib.R | 2 +- .../ml/param/shared/SharedParamsCodeGen.scala | 4 +- .../spark/ml/param/shared/sharedParams.scala | 17 + .../apache/spark/ml/r/SparkRWrappers.scala | 4 +- .../ml/regression/LinearRegression.scala | 50 +- .../regression/JavaLinearRegressionSuite.java | 3 +- .../ml/regression/LinearRegressionSuite.scala | 1045 +++++++++-------- .../spark/ml/tuning/CrossValidatorSuite.scala | 2 +- .../ml/tuning/TrainValidationSplitSuite.scala | 2 +- 10 files changed, 640 insertions(+), 494 deletions(-) diff --git a/R/pkg/R/mllib.R b/R/pkg/R/mllib.R index cd00bbbeec698..25615e805e03c 100644 --- a/R/pkg/R/mllib.R +++ b/R/pkg/R/mllib.R @@ -45,11 +45,12 @@ setClass("PipelineModel", representation(model = "jobj")) #' summary(model) #'} setMethod("glm", signature(formula = "formula", family = "ANY", data = "DataFrame"), - function(formula, family = c("gaussian", "binomial"), data, lambda = 0, alpha = 0) { + function(formula, family = c("gaussian", "binomial"), data, lambda = 0, alpha = 0, + solver = "auto") { family <- match.arg(family) model <- callJStatic("org.apache.spark.ml.api.r.SparkRWrappers", "fitRModelFormula", deparse(formula), data@sdf, family, lambda, - alpha) + alpha, solver) return(new("PipelineModel", model = model)) }) diff --git a/R/pkg/inst/tests/test_mllib.R b/R/pkg/inst/tests/test_mllib.R index 032f8ec68b9d0..3331ce738358c 100644 --- a/R/pkg/inst/tests/test_mllib.R +++ b/R/pkg/inst/tests/test_mllib.R @@ -59,7 +59,7 @@ test_that("feature interaction vs native glm", { test_that("summary coefficients match with native glm", { training <- createDataFrame(sqlContext, iris) - stats <- summary(glm(Sepal_Width ~ Sepal_Length + Species, data = training)) + stats <- summary(glm(Sepal_Width ~ Sepal_Length + Species, data = training, solver = "l-bfgs")) coefs <- as.vector(stats$coefficients) rCoefs <- as.vector(coef(glm(Sepal.Width ~ Sepal.Length + Species, data = iris))) expect_true(all(abs(rCoefs - coefs) < 1e-6)) diff --git a/mllib/src/main/scala/org/apache/spark/ml/param/shared/SharedParamsCodeGen.scala b/mllib/src/main/scala/org/apache/spark/ml/param/shared/SharedParamsCodeGen.scala index 8cb6b5493c61c..c7bca1243092c 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/param/shared/SharedParamsCodeGen.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/param/shared/SharedParamsCodeGen.scala @@ -73,7 +73,9 @@ private[shared] object SharedParamsCodeGen { ParamDesc[Double]("tol", "the convergence tolerance for iterative algorithms"), ParamDesc[Double]("stepSize", "Step size to be used for each iteration of optimization."), ParamDesc[String]("weightCol", "weight column name. 
If this is not set or empty, we treat " + - "all instance weights as 1.0.")) + "all instance weights as 1.0."), + ParamDesc[String]("solver", "the solver algorithm for optimization. If this is not set or " + + "empty, default value is 'auto'.", Some("\"auto\""))) val code = genSharedParams(params) val file = "src/main/scala/org/apache/spark/ml/param/shared/sharedParams.scala" diff --git a/mllib/src/main/scala/org/apache/spark/ml/param/shared/sharedParams.scala b/mllib/src/main/scala/org/apache/spark/ml/param/shared/sharedParams.scala index e3625212e5251..cb2a060a34dd6 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/param/shared/sharedParams.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/param/shared/sharedParams.scala @@ -357,4 +357,21 @@ private[ml] trait HasWeightCol extends Params { /** @group getParam */ final def getWeightCol: String = $(weightCol) } + +/** + * Trait for shared param solver (default: "auto"). + */ +private[ml] trait HasSolver extends Params { + + /** + * Param for the solver algorithm for optimization. If this is not set or empty, default value is 'auto'.. + * @group param + */ + final val solver: Param[String] = new Param[String](this, "solver", "the solver algorithm for optimization. If this is not set or empty, default value is 'auto'.") + + setDefault(solver, "auto") + + /** @group getParam */ + final def getSolver: String = $(solver) +} // scalastyle:on diff --git a/mllib/src/main/scala/org/apache/spark/ml/r/SparkRWrappers.scala b/mllib/src/main/scala/org/apache/spark/ml/r/SparkRWrappers.scala index f5a022c31ed90..fec61fed3cb9c 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/r/SparkRWrappers.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/r/SparkRWrappers.scala @@ -30,13 +30,15 @@ private[r] object SparkRWrappers { df: DataFrame, family: String, lambda: Double, - alpha: Double): PipelineModel = { + alpha: Double, + solver: String): PipelineModel = { val formula = new RFormula().setFormula(value) val estimator = family match { case "gaussian" => new LinearRegression() .setRegParam(lambda) .setElasticNetParam(alpha) .setFitIntercept(formula.hasIntercept) + .setSolver(solver) case "binomial" => new LogisticRegression() .setRegParam(lambda) .setElasticNetParam(alpha) diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala index dd09667ef5a0f..573a61a6eabdf 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala @@ -25,6 +25,7 @@ import breeze.optimize.{CachedDiffFunction, DiffFunction, LBFGS => BreezeLBFGS, import org.apache.spark.{Logging, SparkException} import org.apache.spark.annotation.Experimental import org.apache.spark.ml.feature.Instance +import org.apache.spark.ml.optim.WeightedLeastSquares import org.apache.spark.ml.PredictorParams import org.apache.spark.ml.param.ParamMap import org.apache.spark.ml.param.shared._ @@ -43,7 +44,7 @@ import org.apache.spark.storage.StorageLevel */ private[regression] trait LinearRegressionParams extends PredictorParams with HasRegParam with HasElasticNetParam with HasMaxIter with HasTol - with HasFitIntercept with HasStandardization with HasWeightCol + with HasFitIntercept with HasStandardization with HasWeightCol with HasSolver /** * :: Experimental :: @@ -130,9 +131,53 @@ class LinearRegression(override val uid: String) def setWeightCol(value: String): this.type = set(weightCol, 
value)
   setDefault(weightCol -> "")
 
+  /**
+   * Set the solver algorithm used for optimization.
+   * In case of linear regression, this can be "l-bfgs", "normal" and "auto".
+   * The default value is "auto" which means that the solver algorithm is
+   * selected automatically.
+   * @group setParam
+   */
+  def setSolver(value: String): this.type = set(solver, value)
+  setDefault(solver -> "auto")
+
   override protected def train(dataset: DataFrame): LinearRegressionModel = {
-    // Extract columns from data. If dataset is persisted, do not persist instances.
+    // Extract the number of features before deciding optimization solver.
+    val numFeatures = dataset.select(col($(featuresCol))).limit(1).map {
+      case Row(features: Vector) => features.size
+    }.toArray()(0)
     val w = if ($(weightCol).isEmpty) lit(1.0) else col($(weightCol))
+
+    if (($(solver) == "auto" && $(elasticNetParam) == 0.0 && numFeatures <= 4096) ||
+      $(solver) == "normal") {
+      require($(elasticNetParam) == 0.0, "Only L2 regularization can be used when normal " +
+        "solver is used.")
+      // For low dimensional data, WeightedLeastSquares is more efficient since the
+      // training algorithm only requires one pass through the data. (SPARK-10668)
+      val instances: RDD[WeightedLeastSquares.Instance] = dataset.select(
+        col($(labelCol)), w, col($(featuresCol))).map {
+          case Row(label: Double, weight: Double, features: Vector) =>
+            WeightedLeastSquares.Instance(weight, features, label)
+      }
+
+      val optimizer = new WeightedLeastSquares($(fitIntercept), $(regParam),
+        $(standardization), true)
+      val model = optimizer.fit(instances)
+      // When the model is trained by WeightedLeastSquares, the training summary is not
+      // attached to the returned model.
+      val lrModel = copyValues(new LinearRegressionModel(uid, model.coefficients, model.intercept))
+      // WeightedLeastSquares does not run through iterations. So it does not generate
+      // an objective history.
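+      // The Array(0D) passed to the training summary below is only a single-element
+      // placeholder for that missing objective history.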
+ val (summaryModel, predictionColName) = lrModel.findSummaryModelAndPredictionCol() + val trainingSummary = new LinearRegressionTrainingSummary( + summaryModel.transform(dataset), + predictionColName, + $(labelCol), + $(featuresCol), + Array(0D)) + return lrModel.setSummary(trainingSummary) + } + val instances: RDD[Instance] = dataset.select(col($(labelCol)), w, col($(featuresCol))).map { case Row(label: Double, weight: Double, features: Vector) => Instance(label, weight, features) @@ -155,7 +200,6 @@ class LinearRegression(override val uid: String) new MultivariateOnlineSummarizer, new MultivariateOnlineSummarizer)(seqOp, combOp) } - val numFeatures = featuresSummarizer.mean.size val yMean = ySummarizer.mean(0) val yStd = math.sqrt(ySummarizer.variance(0)) diff --git a/mllib/src/test/java/org/apache/spark/ml/regression/JavaLinearRegressionSuite.java b/mllib/src/test/java/org/apache/spark/ml/regression/JavaLinearRegressionSuite.java index 91c589d00abd5..4fb0b0d1092b6 100644 --- a/mllib/src/test/java/org/apache/spark/ml/regression/JavaLinearRegressionSuite.java +++ b/mllib/src/test/java/org/apache/spark/ml/regression/JavaLinearRegressionSuite.java @@ -61,6 +61,7 @@ public void tearDown() { public void linearRegressionDefaultParams() { LinearRegression lr = new LinearRegression(); assertEquals("label", lr.getLabelCol()); + assertEquals("auto", lr.getSolver()); LinearRegressionModel model = lr.fit(dataset); model.transform(dataset).registerTempTable("prediction"); DataFrame predictions = jsql.sql("SELECT label, prediction FROM prediction"); @@ -75,7 +76,7 @@ public void linearRegressionWithSetters() { // Set params, train, and check as many params as we can. LinearRegression lr = new LinearRegression() .setMaxIter(10) - .setRegParam(1.0); + .setRegParam(1.0).setSolver("l-bfgs"); LinearRegressionModel model = lr.fit(dataset); LinearRegression parent = (LinearRegression) model.parent(); assertEquals(10, parent.getMaxIter()); diff --git a/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala index 73a0a5caf8640..a6e0c72ba9030 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala @@ -31,6 +31,7 @@ import org.apache.spark.sql.{DataFrame, Row} class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext { + private val seed: Int = 42 @transient var dataset: DataFrame = _ @transient var datasetWithoutIntercept: DataFrame = _ @@ -50,15 +51,14 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext { super.beforeAll() dataset = sqlContext.createDataFrame( sc.parallelize(LinearDataGenerator.generateLinearInput( - 6.3, Array(4.7, 7.2), Array(0.9, -1.3), Array(0.7, 1.2), 10000, 42, 0.1), 2)) + 6.3, Array(4.7, 7.2), Array(0.9, -1.3), Array(0.7, 1.2), 10000, seed, 0.1), 2)) /* datasetWithoutIntercept is not needed for correctness testing but is useful for illustrating training model without intercept */ datasetWithoutIntercept = sqlContext.createDataFrame( sc.parallelize(LinearDataGenerator.generateLinearInput( - 0.0, Array(4.7, 7.2), Array(0.9, -1.3), Array(0.7, 1.2), 10000, 42, 0.1), 2)) - + 0.0, Array(4.7, 7.2), Array(0.9, -1.3), Array(0.7, 1.2), 10000, seed, 0.1), 2)) } test("params") { @@ -76,6 +76,7 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext { assert(lir.getElasticNetParam === 0.0) 
assert(lir.getFitIntercept) assert(lir.getStandardization) + assert(lir.getSolver == "auto") val model = lir.fit(dataset) // copied model must have the same parent. @@ -93,525 +94,603 @@ class LinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext { } test("linear regression with intercept without regularization") { - val trainer1 = new LinearRegression - // The result should be the same regardless of standardization without regularization - val trainer2 = (new LinearRegression).setStandardization(false) - val model1 = trainer1.fit(dataset) - val model2 = trainer2.fit(dataset) - - /* - Using the following R code to load the data and train the model using glmnet package. - - library("glmnet") - data <- read.csv("path", header=FALSE, stringsAsFactors=FALSE) - features <- as.matrix(data.frame(as.numeric(data$V2), as.numeric(data$V3))) - label <- as.numeric(data$V1) - weights <- coef(glmnet(features, label, family="gaussian", alpha = 0, lambda = 0)) - > weights - 3 x 1 sparse Matrix of class "dgCMatrix" - s0 - (Intercept) 6.298698 - as.numeric.data.V2. 4.700706 - as.numeric.data.V3. 7.199082 - */ - val interceptR = 6.298698 - val weightsR = Vectors.dense(4.700706, 7.199082) - - assert(model1.intercept ~== interceptR relTol 1E-3) - assert(model1.weights ~= weightsR relTol 1E-3) - assert(model2.intercept ~== interceptR relTol 1E-3) - assert(model2.weights ~= weightsR relTol 1E-3) - - - model1.transform(dataset).select("features", "prediction").collect().foreach { - case Row(features: DenseVector, prediction1: Double) => - val prediction2 = - features(0) * model1.weights(0) + features(1) * model1.weights(1) + model1.intercept - assert(prediction1 ~== prediction2 relTol 1E-5) + Seq("auto", "l-bfgs", "normal").foreach { solver => + val trainer1 = new LinearRegression().setSolver(solver) + // The result should be the same regardless of standardization without regularization + val trainer2 = (new LinearRegression).setStandardization(false).setSolver(solver) + val model1 = trainer1.fit(dataset) + val model2 = trainer2.fit(dataset) + + /* + Using the following R code to load the data and train the model using glmnet package. + + library("glmnet") + data <- read.csv("path", header=FALSE, stringsAsFactors=FALSE) + features <- as.matrix(data.frame(as.numeric(data$V2), as.numeric(data$V3))) + label <- as.numeric(data$V1) + weights <- coef(glmnet(features, label, family="gaussian", alpha = 0, lambda = 0)) + > weights + 3 x 1 sparse Matrix of class "dgCMatrix" + s0 + (Intercept) 6.298698 + as.numeric.data.V2. 4.700706 + as.numeric.data.V3. 
7.199082 + */ + val interceptR = 6.298698 + val weightsR = Vectors.dense(4.700706, 7.199082) + + assert(model1.intercept ~== interceptR relTol 1E-3) + assert(model1.weights ~= weightsR relTol 1E-3) + assert(model2.intercept ~== interceptR relTol 1E-3) + assert(model2.weights ~= weightsR relTol 1E-3) + + model1.transform(dataset).select("features", "prediction").collect().foreach { + case Row(features: DenseVector, prediction1: Double) => + val prediction2 = + features(0) * model1.weights(0) + features(1) * model1.weights(1) + model1.intercept + assert(prediction1 ~== prediction2 relTol 1E-5) + } } } test("linear regression without intercept without regularization") { - val trainer1 = (new LinearRegression).setFitIntercept(false) - // Without regularization the results should be the same - val trainer2 = (new LinearRegression).setFitIntercept(false).setStandardization(false) - val model1 = trainer1.fit(dataset) - val modelWithoutIntercept1 = trainer1.fit(datasetWithoutIntercept) - val model2 = trainer2.fit(dataset) - val modelWithoutIntercept2 = trainer2.fit(datasetWithoutIntercept) - - - /* - weights <- coef(glmnet(features, label, family="gaussian", alpha = 0, lambda = 0, - intercept = FALSE)) - > weights - 3 x 1 sparse Matrix of class "dgCMatrix" - s0 - (Intercept) . - as.numeric.data.V2. 6.995908 - as.numeric.data.V3. 5.275131 - */ - val weightsR = Vectors.dense(6.995908, 5.275131) - - assert(model1.intercept ~== 0 absTol 1E-3) - assert(model1.weights ~= weightsR relTol 1E-3) - assert(model2.intercept ~== 0 absTol 1E-3) - assert(model2.weights ~= weightsR relTol 1E-3) - - /* - Then again with the data with no intercept: - > weightsWithoutIntercept - 3 x 1 sparse Matrix of class "dgCMatrix" - s0 - (Intercept) . - as.numeric.data3.V2. 4.70011 - as.numeric.data3.V3. 7.19943 - */ - val weightsWithoutInterceptR = Vectors.dense(4.70011, 7.19943) - - assert(modelWithoutIntercept1.intercept ~== 0 absTol 1E-3) - assert(modelWithoutIntercept1.weights ~= weightsWithoutInterceptR relTol 1E-3) - assert(modelWithoutIntercept2.intercept ~== 0 absTol 1E-3) - assert(modelWithoutIntercept2.weights ~= weightsWithoutInterceptR relTol 1E-3) + Seq("auto", "l-bfgs", "normal").foreach { solver => + val trainer1 = (new LinearRegression).setFitIntercept(false).setSolver(solver) + // Without regularization the results should be the same + val trainer2 = (new LinearRegression).setFitIntercept(false).setStandardization(false) + .setSolver(solver) + val model1 = trainer1.fit(dataset) + val modelWithoutIntercept1 = trainer1.fit(datasetWithoutIntercept) + val model2 = trainer2.fit(dataset) + val modelWithoutIntercept2 = trainer2.fit(datasetWithoutIntercept) + + /* + weights <- coef(glmnet(features, label, family="gaussian", alpha = 0, lambda = 0, + intercept = FALSE)) + > weights + 3 x 1 sparse Matrix of class "dgCMatrix" + s0 + (Intercept) . + as.numeric.data.V2. 6.995908 + as.numeric.data.V3. 5.275131 + */ + val weightsR = Vectors.dense(6.995908, 5.275131) + + assert(model1.intercept ~== 0 absTol 1E-3) + assert(model1.weights ~= weightsR relTol 1E-3) + assert(model2.intercept ~== 0 absTol 1E-3) + assert(model2.weights ~= weightsR relTol 1E-3) + + /* + Then again with the data with no intercept: + > weightsWithoutIntercept + 3 x 1 sparse Matrix of class "dgCMatrix" + s0 + (Intercept) . + as.numeric.data3.V2. 4.70011 + as.numeric.data3.V3. 
7.19943 + */ + val weightsWithoutInterceptR = Vectors.dense(4.70011, 7.19943) + + assert(modelWithoutIntercept1.intercept ~== 0 absTol 1E-3) + assert(modelWithoutIntercept1.weights ~= weightsWithoutInterceptR relTol 1E-3) + assert(modelWithoutIntercept2.intercept ~== 0 absTol 1E-3) + assert(modelWithoutIntercept2.weights ~= weightsWithoutInterceptR relTol 1E-3) + } } test("linear regression with intercept with L1 regularization") { - val trainer1 = (new LinearRegression).setElasticNetParam(1.0).setRegParam(0.57) - val trainer2 = (new LinearRegression).setElasticNetParam(1.0).setRegParam(0.57) - .setStandardization(false) - val model1 = trainer1.fit(dataset) - val model2 = trainer2.fit(dataset) - - /* - weights <- coef(glmnet(features, label, family="gaussian", alpha = 1.0, lambda = 0.57)) - > weights - 3 x 1 sparse Matrix of class "dgCMatrix" - s0 - (Intercept) 6.24300 - as.numeric.data.V2. 4.024821 - as.numeric.data.V3. 6.679841 - */ - val interceptR1 = 6.24300 - val weightsR1 = Vectors.dense(4.024821, 6.679841) - - assert(model1.intercept ~== interceptR1 relTol 1E-3) - assert(model1.weights ~= weightsR1 relTol 1E-3) - - /* - weights <- coef(glmnet(features, label, family="gaussian", alpha = 1.0, lambda = 0.57, - standardize=FALSE)) - > weights - 3 x 1 sparse Matrix of class "dgCMatrix" - s0 - (Intercept) 6.416948 - as.numeric.data.V2. 3.893869 - as.numeric.data.V3. 6.724286 - */ - val interceptR2 = 6.416948 - val weightsR2 = Vectors.dense(3.893869, 6.724286) - - assert(model2.intercept ~== interceptR2 relTol 1E-3) - assert(model2.weights ~= weightsR2 relTol 1E-3) - - - model1.transform(dataset).select("features", "prediction").collect().foreach { - case Row(features: DenseVector, prediction1: Double) => - val prediction2 = - features(0) * model1.weights(0) + features(1) * model1.weights(1) + model1.intercept - assert(prediction1 ~== prediction2 relTol 1E-5) + Seq("auto", "l-bfgs", "normal").foreach { solver => + val trainer1 = (new LinearRegression).setElasticNetParam(1.0).setRegParam(0.57) + .setSolver(solver) + val trainer2 = (new LinearRegression).setElasticNetParam(1.0).setRegParam(0.57) + .setSolver(solver).setStandardization(false) + + var model1: LinearRegressionModel = null + var model2: LinearRegressionModel = null + + // Normal optimizer is not supported with only L1 regularization case. + if (solver == "normal") { + intercept[IllegalArgumentException] { + trainer1.fit(dataset) + trainer2.fit(dataset) + } + } else { + model1 = trainer1.fit(dataset) + model2 = trainer2.fit(dataset) + + + /* + weights <- coef(glmnet(features, label, family="gaussian", alpha = 1.0, lambda = 0.57)) + > weights + 3 x 1 sparse Matrix of class "dgCMatrix" + s0 + (Intercept) 6.24300 + as.numeric.data.V2. 4.024821 + as.numeric.data.V3. 6.679841 + */ + val interceptR1 = 6.24300 + val weightsR1 = Vectors.dense(4.024821, 6.679841) + assert(model1.intercept ~== interceptR1 relTol 1E-3) + assert(model1.weights ~= weightsR1 relTol 1E-3) + + /* + weights <- coef(glmnet(features, label, family="gaussian", alpha = 1.0, lambda = 0.57, + standardize=FALSE)) + > weights + 3 x 1 sparse Matrix of class "dgCMatrix" + s0 + (Intercept) 6.416948 + as.numeric.data.V2. 3.893869 + as.numeric.data.V3. 
6.724286 + */ + val interceptR2 = 6.416948 + val weightsR2 = Vectors.dense(3.893869, 6.724286) + + assert(model2.intercept ~== interceptR2 relTol 1E-3) + assert(model2.weights ~= weightsR2 relTol 1E-3) + + model1.transform(dataset).select("features", "prediction").collect().foreach { + case Row(features: DenseVector, prediction1: Double) => + val prediction2 = + features(0) * model1.weights(0) + features(1) * model1.weights(1) + model1.intercept + assert(prediction1 ~== prediction2 relTol 1E-5) + } + } } } test("linear regression without intercept with L1 regularization") { - val trainer1 = (new LinearRegression).setElasticNetParam(1.0).setRegParam(0.57) - .setFitIntercept(false) - val trainer2 = (new LinearRegression).setElasticNetParam(1.0).setRegParam(0.57) - .setFitIntercept(false).setStandardization(false) - val model1 = trainer1.fit(dataset) - val model2 = trainer2.fit(dataset) - - /* - weights <- coef(glmnet(features, label, family="gaussian", alpha = 1.0, lambda = 0.57, - intercept=FALSE)) - > weights - 3 x 1 sparse Matrix of class "dgCMatrix" - s0 - (Intercept) . - as.numeric.data.V2. 6.299752 - as.numeric.data.V3. 4.772913 - */ - val interceptR1 = 0.0 - val weightsR1 = Vectors.dense(6.299752, 4.772913) - - assert(model1.intercept ~== interceptR1 absTol 1E-3) - assert(model1.weights ~= weightsR1 relTol 1E-3) - - /* - weights <- coef(glmnet(features, label, family="gaussian", alpha = 1.0, lambda = 0.57, - intercept=FALSE, standardize=FALSE)) - > weights - 3 x 1 sparse Matrix of class "dgCMatrix" - s0 - (Intercept) . - as.numeric.data.V2. 6.232193 - as.numeric.data.V3. 4.764229 - */ - val interceptR2 = 0.0 - val weightsR2 = Vectors.dense(6.232193, 4.764229) - - assert(model2.intercept ~== interceptR2 absTol 1E-3) - assert(model2.weights ~= weightsR2 relTol 1E-3) - - - model1.transform(dataset).select("features", "prediction").collect().foreach { - case Row(features: DenseVector, prediction1: Double) => - val prediction2 = - features(0) * model1.weights(0) + features(1) * model1.weights(1) + model1.intercept - assert(prediction1 ~== prediction2 relTol 1E-5) + Seq("auto", "l-bfgs", "normal").foreach { solver => + val trainer1 = (new LinearRegression).setElasticNetParam(1.0).setRegParam(0.57) + .setFitIntercept(false).setSolver(solver) + val trainer2 = (new LinearRegression).setElasticNetParam(1.0).setRegParam(0.57) + .setFitIntercept(false).setStandardization(false).setSolver(solver) + + var model1: LinearRegressionModel = null + var model2: LinearRegressionModel = null + + // Normal optimizer is not supported with only L1 regularization case. + if (solver == "normal") { + intercept[IllegalArgumentException] { + trainer1.fit(dataset) + trainer2.fit(dataset) + } + } else { + model1 = trainer1.fit(dataset) + model2 = trainer2.fit(dataset) + + /* + weights <- coef(glmnet(features, label, family="gaussian", alpha = 1.0, lambda = 0.57, + intercept=FALSE)) + > weights + 3 x 1 sparse Matrix of class "dgCMatrix" + s0 + (Intercept) . + as.numeric.data.V2. 6.299752 + as.numeric.data.V3. 4.772913 + */ + val interceptR1 = 0.0 + val weightsR1 = Vectors.dense(6.299752, 4.772913) + + assert(model1.intercept ~== interceptR1 absTol 1E-3) + assert(model1.weights ~= weightsR1 relTol 1E-3) + + /* + weights <- coef(glmnet(features, label, family="gaussian", alpha = 1.0, lambda = 0.57, + intercept=FALSE, standardize=FALSE)) + > weights + 3 x 1 sparse Matrix of class "dgCMatrix" + s0 + (Intercept) . + as.numeric.data.V2. 6.232193 + as.numeric.data.V3. 
4.764229 + */ + val interceptR2 = 0.0 + val weightsR2 = Vectors.dense(6.232193, 4.764229) + + assert(model2.intercept ~== interceptR2 absTol 1E-3) + assert(model2.weights ~= weightsR2 relTol 1E-3) + + model1.transform(dataset).select("features", "prediction").collect().foreach { + case Row(features: DenseVector, prediction1: Double) => + val prediction2 = + features(0) * model1.weights(0) + features(1) * model1.weights(1) + model1.intercept + assert(prediction1 ~== prediction2 relTol 1E-5) + } + } } } test("linear regression with intercept with L2 regularization") { - val trainer1 = (new LinearRegression).setElasticNetParam(0.0).setRegParam(2.3) - val trainer2 = (new LinearRegression).setElasticNetParam(0.0).setRegParam(2.3) - .setStandardization(false) - val model1 = trainer1.fit(dataset) - val model2 = trainer2.fit(dataset) - - /* - weights <- coef(glmnet(features, label, family="gaussian", alpha = 0.0, lambda = 2.3)) - > weights - 3 x 1 sparse Matrix of class "dgCMatrix" - s0 - (Intercept) 5.269376 - as.numeric.data.V2. 3.736216 - as.numeric.data.V3. 5.712356) - */ - val interceptR1 = 5.269376 - val weightsR1 = Vectors.dense(3.736216, 5.712356) - - assert(model1.intercept ~== interceptR1 relTol 1E-3) - assert(model1.weights ~= weightsR1 relTol 1E-3) - - /* - weights <- coef(glmnet(features, label, family="gaussian", alpha = 0.0, lambda = 2.3, - standardize=FALSE)) - > weights - 3 x 1 sparse Matrix of class "dgCMatrix" - s0 - (Intercept) 5.791109 - as.numeric.data.V2. 3.435466 - as.numeric.data.V3. 5.910406 - */ - val interceptR2 = 5.791109 - val weightsR2 = Vectors.dense(3.435466, 5.910406) - - assert(model2.intercept ~== interceptR2 relTol 1E-3) - assert(model2.weights ~= weightsR2 relTol 1E-3) - - model1.transform(dataset).select("features", "prediction").collect().foreach { - case Row(features: DenseVector, prediction1: Double) => - val prediction2 = - features(0) * model1.weights(0) + features(1) * model1.weights(1) + model1.intercept - assert(prediction1 ~== prediction2 relTol 1E-5) + Seq("auto", "l-bfgs", "normal").foreach { solver => + val trainer1 = (new LinearRegression).setElasticNetParam(0.0).setRegParam(2.3) + .setSolver(solver) + val trainer2 = (new LinearRegression).setElasticNetParam(0.0).setRegParam(2.3) + .setStandardization(false).setSolver(solver) + val model1 = trainer1.fit(dataset) + val model2 = trainer2.fit(dataset) + + /* + weights <- coef(glmnet(features, label, family="gaussian", alpha = 0.0, lambda = 2.3)) + > weights + 3 x 1 sparse Matrix of class "dgCMatrix" + s0 + (Intercept) 5.269376 + as.numeric.data.V2. 3.736216 + as.numeric.data.V3. 5.712356) + */ + val interceptR1 = 5.269376 + val weightsR1 = Vectors.dense(3.736216, 5.712356) + + assert(model1.intercept ~== interceptR1 relTol 1E-3) + assert(model1.weights ~= weightsR1 relTol 1E-3) + + /* + weights <- coef(glmnet(features, label, family="gaussian", alpha = 0.0, lambda = 2.3, + standardize=FALSE)) + > weights + 3 x 1 sparse Matrix of class "dgCMatrix" + s0 + (Intercept) 5.791109 + as.numeric.data.V2. 3.435466 + as.numeric.data.V3. 
5.910406 + */ + val interceptR2 = 5.791109 + val weightsR2 = Vectors.dense(3.435466, 5.910406) + + assert(model2.intercept ~== interceptR2 relTol 1E-3) + assert(model2.weights ~= weightsR2 relTol 1E-3) + + model1.transform(dataset).select("features", "prediction").collect().foreach { + case Row(features: DenseVector, prediction1: Double) => + val prediction2 = + features(0) * model1.weights(0) + features(1) * model1.weights(1) + model1.intercept + assert(prediction1 ~== prediction2 relTol 1E-5) + } } } test("linear regression without intercept with L2 regularization") { - val trainer1 = (new LinearRegression).setElasticNetParam(0.0).setRegParam(2.3) - .setFitIntercept(false) - val trainer2 = (new LinearRegression).setElasticNetParam(0.0).setRegParam(2.3) - .setFitIntercept(false).setStandardization(false) - val model1 = trainer1.fit(dataset) - val model2 = trainer2.fit(dataset) - - /* - weights <- coef(glmnet(features, label, family="gaussian", alpha = 0.0, lambda = 2.3, - intercept = FALSE)) - > weights - 3 x 1 sparse Matrix of class "dgCMatrix" - s0 - (Intercept) . - as.numeric.data.V2. 5.522875 - as.numeric.data.V3. 4.214502 - */ - val interceptR1 = 0.0 - val weightsR1 = Vectors.dense(5.522875, 4.214502) - - assert(model1.intercept ~== interceptR1 absTol 1E-3) - assert(model1.weights ~= weightsR1 relTol 1E-3) - - /* - weights <- coef(glmnet(features, label, family="gaussian", alpha = 0.0, lambda = 2.3, - intercept = FALSE, standardize=FALSE)) - > weights - 3 x 1 sparse Matrix of class "dgCMatrix" - s0 - (Intercept) . - as.numeric.data.V2. 5.263704 - as.numeric.data.V3. 4.187419 - */ - val interceptR2 = 0.0 - val weightsR2 = Vectors.dense(5.263704, 4.187419) - - assert(model2.intercept ~== interceptR2 absTol 1E-3) - assert(model2.weights ~= weightsR2 relTol 1E-3) - - model1.transform(dataset).select("features", "prediction").collect().foreach { - case Row(features: DenseVector, prediction1: Double) => - val prediction2 = - features(0) * model1.weights(0) + features(1) * model1.weights(1) + model1.intercept - assert(prediction1 ~== prediction2 relTol 1E-5) + Seq("auto", "l-bfgs", "normal").foreach { solver => + val trainer1 = (new LinearRegression).setElasticNetParam(0.0).setRegParam(2.3) + .setFitIntercept(false).setSolver(solver) + val trainer2 = (new LinearRegression).setElasticNetParam(0.0).setRegParam(2.3) + .setFitIntercept(false).setStandardization(false).setSolver(solver) + val model1 = trainer1.fit(dataset) + val model2 = trainer2.fit(dataset) + + /* + weights <- coef(glmnet(features, label, family="gaussian", alpha = 0.0, lambda = 2.3, + intercept = FALSE)) + > weights + 3 x 1 sparse Matrix of class "dgCMatrix" + s0 + (Intercept) . + as.numeric.data.V2. 5.522875 + as.numeric.data.V3. 4.214502 + */ + val interceptR1 = 0.0 + val weightsR1 = Vectors.dense(5.522875, 4.214502) + + assert(model1.intercept ~== interceptR1 absTol 1E-3) + assert(model1.weights ~= weightsR1 relTol 1E-3) + + /* + weights <- coef(glmnet(features, label, family="gaussian", alpha = 0.0, lambda = 2.3, + intercept = FALSE, standardize=FALSE)) + > weights + 3 x 1 sparse Matrix of class "dgCMatrix" + s0 + (Intercept) . + as.numeric.data.V2. 5.263704 + as.numeric.data.V3. 
4.187419 + */ + val interceptR2 = 0.0 + val weightsR2 = Vectors.dense(5.263704, 4.187419) + + assert(model2.intercept ~== interceptR2 absTol 1E-3) + assert(model2.weights ~= weightsR2 relTol 1E-3) + + model1.transform(dataset).select("features", "prediction").collect().foreach { + case Row(features: DenseVector, prediction1: Double) => + val prediction2 = + features(0) * model1.weights(0) + features(1) * model1.weights(1) + model1.intercept + assert(prediction1 ~== prediction2 relTol 1E-5) + } } } test("linear regression with intercept with ElasticNet regularization") { - val trainer1 = (new LinearRegression).setElasticNetParam(0.3).setRegParam(1.6) - val trainer2 = (new LinearRegression).setElasticNetParam(0.3).setRegParam(1.6) - .setStandardization(false) - val model1 = trainer1.fit(dataset) - val model2 = trainer2.fit(dataset) - - /* - weights <- coef(glmnet(features, label, family="gaussian", alpha = 0.3, lambda = 1.6)) - > weights - 3 x 1 sparse Matrix of class "dgCMatrix" - s0 - (Intercept) 6.324108 - as.numeric.data.V2. 3.168435 - as.numeric.data.V3. 5.200403 - */ - val interceptR1 = 5.696056 - val weightsR1 = Vectors.dense(3.670489, 6.001122) - - assert(model1.intercept ~== interceptR1 relTol 1E-3) - assert(model1.weights ~= weightsR1 relTol 1E-3) - - /* - weights <- coef(glmnet(features, label, family="gaussian", alpha = 0.3, lambda = 1.6 - standardize=FALSE)) - > weights - 3 x 1 sparse Matrix of class "dgCMatrix" - s0 - (Intercept) 6.114723 - as.numeric.data.V2. 3.409937 - as.numeric.data.V3. 6.146531 - */ - val interceptR2 = 6.114723 - val weightsR2 = Vectors.dense(3.409937, 6.146531) - - assert(model2.intercept ~== interceptR2 relTol 1E-3) - assert(model2.weights ~= weightsR2 relTol 1E-3) - - model1.transform(dataset).select("features", "prediction").collect().foreach { - case Row(features: DenseVector, prediction1: Double) => - val prediction2 = - features(0) * model1.weights(0) + features(1) * model1.weights(1) + model1.intercept - assert(prediction1 ~== prediction2 relTol 1E-5) + Seq("auto", "l-bfgs", "normal").foreach { solver => + val trainer1 = (new LinearRegression).setElasticNetParam(0.3).setRegParam(1.6) + .setSolver(solver) + val trainer2 = (new LinearRegression).setElasticNetParam(0.3).setRegParam(1.6) + .setStandardization(false).setSolver(solver) + + var model1: LinearRegressionModel = null + var model2: LinearRegressionModel = null + + // Normal optimizer is not supported with non-zero elasticnet parameter. + if (solver == "normal") { + intercept[IllegalArgumentException] { + trainer1.fit(dataset) + trainer2.fit(dataset) + } + } else { + model1 = trainer1.fit(dataset) + model2 = trainer2.fit(dataset) + + /* + weights <- coef(glmnet(features, label, family="gaussian", alpha = 0.3, lambda = 1.6)) + > weights + 3 x 1 sparse Matrix of class "dgCMatrix" + s0 + (Intercept) 6.324108 + as.numeric.data.V2. 3.168435 + as.numeric.data.V3. 5.200403 + */ + val interceptR1 = 5.696056 + val weightsR1 = Vectors.dense(3.670489, 6.001122) + + assert(model1.intercept ~== interceptR1 relTol 1E-3) + assert(model1.weights ~= weightsR1 relTol 1E-3) + + /* + weights <- coef(glmnet(features, label, family="gaussian", alpha = 0.3, lambda = 1.6 + standardize=FALSE)) + > weights + 3 x 1 sparse Matrix of class "dgCMatrix" + s0 + (Intercept) 6.114723 + as.numeric.data.V2. 3.409937 + as.numeric.data.V3. 
6.146531 + */ + val interceptR2 = 6.114723 + val weightsR2 = Vectors.dense(3.409937, 6.146531) + + assert(model2.intercept ~== interceptR2 relTol 1E-3) + assert(model2.weights ~= weightsR2 relTol 1E-3) + + model1.transform(dataset).select("features", "prediction").collect().foreach { + case Row(features: DenseVector, prediction1: Double) => + val prediction2 = + features(0) * model1.weights(0) + features(1) * model1.weights(1) + model1.intercept + assert(prediction1 ~== prediction2 relTol 1E-5) + } + } } } test("linear regression without intercept with ElasticNet regularization") { - val trainer1 = (new LinearRegression).setElasticNetParam(0.3).setRegParam(1.6) - .setFitIntercept(false) - val trainer2 = (new LinearRegression).setElasticNetParam(0.3).setRegParam(1.6) - .setFitIntercept(false).setStandardization(false) - val model1 = trainer1.fit(dataset) - val model2 = trainer2.fit(dataset) - - /* - weights <- coef(glmnet(features, label, family="gaussian", alpha = 0.3, lambda = 1.6, - intercept=FALSE)) - > weights - 3 x 1 sparse Matrix of class "dgCMatrix" - s0 - (Intercept) . - as.numeric.dataM.V2. 5.673348 - as.numeric.dataM.V3. 4.322251 - */ - val interceptR1 = 0.0 - val weightsR1 = Vectors.dense(5.673348, 4.322251) - - assert(model1.intercept ~== interceptR1 absTol 1E-3) - assert(model1.weights ~= weightsR1 relTol 1E-3) - - /* - weights <- coef(glmnet(features, label, family="gaussian", alpha = 0.3, lambda = 1.6, - intercept=FALSE, standardize=FALSE)) - > weights - 3 x 1 sparse Matrix of class "dgCMatrix" - s0 - (Intercept) . - as.numeric.data.V2. 5.477988 - as.numeric.data.V3. 4.297622 - */ - val interceptR2 = 0.0 - val weightsR2 = Vectors.dense(5.477988, 4.297622) - - assert(model2.intercept ~== interceptR2 absTol 1E-3) - assert(model2.weights ~= weightsR2 relTol 1E-3) - - model1.transform(dataset).select("features", "prediction").collect().foreach { - case Row(features: DenseVector, prediction1: Double) => - val prediction2 = - features(0) * model1.weights(0) + features(1) * model1.weights(1) + model1.intercept - assert(prediction1 ~== prediction2 relTol 1E-5) + Seq("auto", "l-bfgs", "normal").foreach { solver => + val trainer1 = (new LinearRegression).setElasticNetParam(0.3).setRegParam(1.6) + .setFitIntercept(false).setSolver(solver) + val trainer2 = (new LinearRegression).setElasticNetParam(0.3).setRegParam(1.6) + .setFitIntercept(false).setStandardization(false).setSolver(solver) + + var model1: LinearRegressionModel = null + var model2: LinearRegressionModel = null + + // Normal optimizer is not supported with non-zero elasticnet parameter. + if (solver == "normal") { + intercept[IllegalArgumentException] { + trainer1.fit(dataset) + trainer2.fit(dataset) + } + } else { + model1 = trainer1.fit(dataset) + model2 = trainer2.fit(dataset) + + /* + weights <- coef(glmnet(features, label, family="gaussian", alpha = 0.3, lambda = 1.6, + intercept=FALSE)) + > weights + 3 x 1 sparse Matrix of class "dgCMatrix" + s0 + (Intercept) . + as.numeric.dataM.V2. 5.673348 + as.numeric.dataM.V3. 4.322251 + */ + val interceptR1 = 0.0 + val weightsR1 = Vectors.dense(5.673348, 4.322251) + + assert(model1.intercept ~== interceptR1 absTol 1E-3) + assert(model1.weights ~= weightsR1 relTol 1E-3) + + /* + weights <- coef(glmnet(features, label, family="gaussian", alpha = 0.3, lambda = 1.6, + intercept=FALSE, standardize=FALSE)) + > weights + 3 x 1 sparse Matrix of class "dgCMatrix" + s0 + (Intercept) . + as.numeric.data.V2. 5.477988 + as.numeric.data.V3. 
4.297622 + */ + val interceptR2 = 0.0 + val weightsR2 = Vectors.dense(5.477988, 4.297622) + + assert(model2.intercept ~== interceptR2 absTol 1E-3) + assert(model2.weights ~= weightsR2 relTol 1E-3) + + model1.transform(dataset).select("features", "prediction").collect().foreach { + case Row(features: DenseVector, prediction1: Double) => + val prediction2 = + features(0) * model1.weights(0) + features(1) * model1.weights(1) + model1.intercept + assert(prediction1 ~== prediction2 relTol 1E-5) + } + } } } test("linear regression model training summary") { - val trainer = new LinearRegression - val model = trainer.fit(dataset) - val trainerNoPredictionCol = trainer.setPredictionCol("") - val modelNoPredictionCol = trainerNoPredictionCol.fit(dataset) - - - // Training results for the model should be available - assert(model.hasSummary) - assert(modelNoPredictionCol.hasSummary) - - // Schema should be a superset of the input dataset - assert((dataset.schema.fieldNames.toSet + "prediction").subsetOf( - model.summary.predictions.schema.fieldNames.toSet)) - // Validate that we re-insert a prediction column for evaluation - val modelNoPredictionColFieldNames = modelNoPredictionCol.summary.predictions.schema.fieldNames - assert((dataset.schema.fieldNames.toSet).subsetOf( - modelNoPredictionColFieldNames.toSet)) - assert(modelNoPredictionColFieldNames.exists(s => s.startsWith("prediction_"))) - - // Residuals in [[LinearRegressionResults]] should equal those manually computed - val expectedResiduals = dataset.select("features", "label") - .map { case Row(features: DenseVector, label: Double) => - val prediction = - features(0) * model.weights(0) + features(1) * model.weights(1) + model.intercept - label - prediction - } - .zip(model.summary.residuals.map(_.getDouble(0))) - .collect() - .foreach { case (manualResidual: Double, resultResidual: Double) => - assert(manualResidual ~== resultResidual relTol 1E-5) - } + Seq("auto", "l-bfgs", "normal").foreach { solver => + val trainer = new LinearRegression().setSolver(solver) + val model = trainer.fit(dataset) + val trainerNoPredictionCol = trainer.setPredictionCol("") + val modelNoPredictionCol = trainerNoPredictionCol.fit(dataset) + + + // Training results for the model should be available + assert(model.hasSummary) + assert(modelNoPredictionCol.hasSummary) + + // Schema should be a superset of the input dataset + assert((dataset.schema.fieldNames.toSet + "prediction").subsetOf( + model.summary.predictions.schema.fieldNames.toSet)) + // Validate that we re-insert a prediction column for evaluation + val modelNoPredictionColFieldNames + = modelNoPredictionCol.summary.predictions.schema.fieldNames + assert((dataset.schema.fieldNames.toSet).subsetOf( + modelNoPredictionColFieldNames.toSet)) + assert(modelNoPredictionColFieldNames.exists(s => s.startsWith("prediction_"))) + + // Residuals in [[LinearRegressionResults]] should equal those manually computed + val expectedResiduals = dataset.select("features", "label") + .map { case Row(features: DenseVector, label: Double) => + val prediction = + features(0) * model.weights(0) + features(1) * model.weights(1) + model.intercept + label - prediction + } + .zip(model.summary.residuals.map(_.getDouble(0))) + .collect() + .foreach { case (manualResidual: Double, resultResidual: Double) => + assert(manualResidual ~== resultResidual relTol 1E-5) + } - /* - Use the following R code to generate model training results. 
- - predictions <- predict(fit, newx=features) - residuals <- label - predictions - > mean(residuals^2) # MSE - [1] 0.009720325 - > mean(abs(residuals)) # MAD - [1] 0.07863206 - > cor(predictions, label)^2# r^2 - [,1] - s0 0.9998749 - */ - assert(model.summary.meanSquaredError ~== 0.00972035 relTol 1E-5) - assert(model.summary.meanAbsoluteError ~== 0.07863206 relTol 1E-5) - assert(model.summary.r2 ~== 0.9998749 relTol 1E-5) - - // Objective function should be monotonically decreasing for linear regression - assert( - model.summary - .objectiveHistory - .sliding(2) - .forall(x => x(0) >= x(1))) + /* + Use the following R code to generate model training results. + + predictions <- predict(fit, newx=features) + residuals <- label - predictions + > mean(residuals^2) # MSE + [1] 0.009720325 + > mean(abs(residuals)) # MAD + [1] 0.07863206 + > cor(predictions, label)^2# r^2 + [,1] + s0 0.9998749 + */ + assert(model.summary.meanSquaredError ~== 0.00972035 relTol 1E-5) + assert(model.summary.meanAbsoluteError ~== 0.07863206 relTol 1E-5) + assert(model.summary.r2 ~== 0.9998749 relTol 1E-5) + + // Normal solver uses "WeightedLeastSquares". This algorithm does not generate + // objective history because it does not run through iterations. + if (solver == "l-bfgs") { + // Objective function should be monotonically decreasing for linear regression + assert( + model.summary + .objectiveHistory + .sliding(2) + .forall(x => x(0) >= x(1))) + } + } } test("linear regression model testset evaluation summary") { - val trainer = new LinearRegression - val model = trainer.fit(dataset) - - // Evaluating on training dataset should yield results summary equal to training summary - val testSummary = model.evaluate(dataset) - assert(model.summary.meanSquaredError ~== testSummary.meanSquaredError relTol 1E-5) - assert(model.summary.r2 ~== testSummary.r2 relTol 1E-5) - model.summary.residuals.select("residuals").collect() - .zip(testSummary.residuals.select("residuals").collect()) - .forall { case (Row(r1: Double), Row(r2: Double)) => r1 ~== r2 relTol 1E-5 } + Seq("auto", "l-bfgs", "normal").foreach { solver => + val trainer = new LinearRegression().setSolver(solver) + val model = trainer.fit(dataset) + + // Evaluating on training dataset should yield results summary equal to training summary + val testSummary = model.evaluate(dataset) + assert(model.summary.meanSquaredError ~== testSummary.meanSquaredError relTol 1E-5) + assert(model.summary.r2 ~== testSummary.r2 relTol 1E-5) + model.summary.residuals.select("residuals").collect() + .zip(testSummary.residuals.select("residuals").collect()) + .forall { case (Row(r1: Double), Row(r2: Double)) => r1 ~== r2 relTol 1E-5 } + } } - test("linear regression with weighted samples"){ - val (data, weightedData) = { - val activeData = LinearDataGenerator.generateLinearInput( - 6.3, Array(4.7, 7.2), Array(0.9, -1.3), Array(0.7, 1.2), 500, 1, 0.1) - - val rnd = new Random(8392) - val signedData = activeData.map { case p: LabeledPoint => - (rnd.nextGaussian() > 0.0, p) - } - - val data1 = signedData.flatMap { - case (true, p) => Iterator(p, p) - case (false, p) => Iterator(p) - } - - val weightedSignedData = signedData.flatMap { - case (true, LabeledPoint(label, features)) => - Iterator( - Instance(label, weight = 1.2, features), - Instance(label, weight = 0.8, features) - ) - case (false, LabeledPoint(label, features)) => - Iterator( - Instance(label, weight = 0.3, features), - Instance(label, weight = 0.1, features), - Instance(label, weight = 0.6, features) - ) + test("linear 
regression with weighted samples") { + Seq("auto", "l-bfgs", "normal").foreach { solver => + val (data, weightedData) = { + val activeData = LinearDataGenerator.generateLinearInput( + 6.3, Array(4.7, 7.2), Array(0.9, -1.3), Array(0.7, 1.2), 500, 1, 0.1) + + val rnd = new Random(8392) + val signedData = activeData.map { case p: LabeledPoint => + (rnd.nextGaussian() > 0.0, p) + } + + val data1 = signedData.flatMap { + case (true, p) => Iterator(p, p) + case (false, p) => Iterator(p) + } + + val weightedSignedData = signedData.flatMap { + case (true, LabeledPoint(label, features)) => + Iterator( + Instance(label, weight = 1.2, features), + Instance(label, weight = 0.8, features) + ) + case (false, LabeledPoint(label, features)) => + Iterator( + Instance(label, weight = 0.3, features), + Instance(label, weight = 0.1, features), + Instance(label, weight = 0.6, features) + ) + } + + val noiseData = LinearDataGenerator.generateLinearInput( + 2, Array(1, 3), Array(0.9, -1.3), Array(0.7, 1.2), 500, 1, 0.1) + val weightedNoiseData = noiseData.map { + case LabeledPoint(label, features) => Instance(label, weight = 0, features) + } + val data2 = weightedSignedData ++ weightedNoiseData + + (sqlContext.createDataFrame(sc.parallelize(data1, 4)), + sqlContext.createDataFrame(sc.parallelize(data2, 4))) } - val noiseData = LinearDataGenerator.generateLinearInput( - 2, Array(1, 3), Array(0.9, -1.3), Array(0.7, 1.2), 500, 1, 0.1) - val weightedNoiseData = noiseData.map { - case LabeledPoint(label, features) => Instance(label, weight = 0, features) - } - val data2 = weightedSignedData ++ weightedNoiseData - - (sqlContext.createDataFrame(sc.parallelize(data1, 4)), - sqlContext.createDataFrame(sc.parallelize(data2, 4))) + val trainer1a = (new LinearRegression).setFitIntercept(true) + .setElasticNetParam(0.0).setRegParam(0.21).setStandardization(true).setSolver(solver) + val trainer1b = (new LinearRegression).setFitIntercept(true).setWeightCol("weight") + .setElasticNetParam(0.0).setRegParam(0.21).setStandardization(true).setSolver(solver) + + // Normal optimizer is not supported with non-zero elasticnet parameter. 
+ val model1a0 = trainer1a.fit(data) + val model1a1 = trainer1a.fit(weightedData) + val model1b = trainer1b.fit(weightedData) + + assert(model1a0.weights !~= model1a1.weights absTol 1E-3) + assert(model1a0.intercept !~= model1a1.intercept absTol 1E-3) + assert(model1a0.weights ~== model1b.weights absTol 1E-3) + assert(model1a0.intercept ~== model1b.intercept absTol 1E-3) + + val trainer2a = (new LinearRegression).setFitIntercept(true) + .setElasticNetParam(0.0).setRegParam(0.21).setStandardization(false).setSolver(solver) + val trainer2b = (new LinearRegression).setFitIntercept(true).setWeightCol("weight") + .setElasticNetParam(0.0).setRegParam(0.21).setStandardization(false).setSolver(solver) + val model2a0 = trainer2a.fit(data) + val model2a1 = trainer2a.fit(weightedData) + val model2b = trainer2b.fit(weightedData) + assert(model2a0.weights !~= model2a1.weights absTol 1E-3) + assert(model2a0.intercept !~= model2a1.intercept absTol 1E-3) + assert(model2a0.weights ~== model2b.weights absTol 1E-3) + assert(model2a0.intercept ~== model2b.intercept absTol 1E-3) + + val trainer3a = (new LinearRegression).setFitIntercept(false) + .setElasticNetParam(0.0).setRegParam(0.21).setStandardization(true).setSolver(solver) + val trainer3b = (new LinearRegression).setFitIntercept(false).setWeightCol("weight") + .setElasticNetParam(0.0).setRegParam(0.21).setStandardization(true).setSolver(solver) + val model3a0 = trainer3a.fit(data) + val model3a1 = trainer3a.fit(weightedData) + val model3b = trainer3b.fit(weightedData) + assert(model3a0.weights !~= model3a1.weights absTol 1E-3) + assert(model3a0.weights ~== model3b.weights absTol 1E-3) + + val trainer4a = (new LinearRegression).setFitIntercept(false) + .setElasticNetParam(0.0).setRegParam(0.21).setStandardization(false).setSolver(solver) + val trainer4b = (new LinearRegression).setFitIntercept(false).setWeightCol("weight") + .setElasticNetParam(0.0).setRegParam(0.21).setStandardization(false).setSolver(solver) + val model4a0 = trainer4a.fit(data) + val model4a1 = trainer4a.fit(weightedData) + val model4b = trainer4b.fit(weightedData) + assert(model4a0.weights !~= model4a1.weights absTol 1E-3) + assert(model4a0.weights ~== model4b.weights absTol 1E-3) } - - val trainer1a = (new LinearRegression).setFitIntercept(true) - .setElasticNetParam(0.38).setRegParam(0.21).setStandardization(true) - val trainer1b = (new LinearRegression).setFitIntercept(true).setWeightCol("weight") - .setElasticNetParam(0.38).setRegParam(0.21).setStandardization(true) - val model1a0 = trainer1a.fit(data) - val model1a1 = trainer1a.fit(weightedData) - val model1b = trainer1b.fit(weightedData) - assert(model1a0.weights !~= model1a1.weights absTol 1E-3) - assert(model1a0.intercept !~= model1a1.intercept absTol 1E-3) - assert(model1a0.weights ~== model1b.weights absTol 1E-3) - assert(model1a0.intercept ~== model1b.intercept absTol 1E-3) - - val trainer2a = (new LinearRegression).setFitIntercept(true) - .setElasticNetParam(0.38).setRegParam(0.21).setStandardization(false) - val trainer2b = (new LinearRegression).setFitIntercept(true).setWeightCol("weight") - .setElasticNetParam(0.38).setRegParam(0.21).setStandardization(false) - val model2a0 = trainer2a.fit(data) - val model2a1 = trainer2a.fit(weightedData) - val model2b = trainer2b.fit(weightedData) - assert(model2a0.weights !~= model2a1.weights absTol 1E-3) - assert(model2a0.intercept !~= model2a1.intercept absTol 1E-3) - assert(model2a0.weights ~== model2b.weights absTol 1E-3) - assert(model2a0.intercept ~== model2b.intercept absTol 
1E-3) - - val trainer3a = (new LinearRegression).setFitIntercept(false) - .setElasticNetParam(0.38).setRegParam(0.21).setStandardization(true) - val trainer3b = (new LinearRegression).setFitIntercept(false).setWeightCol("weight") - .setElasticNetParam(0.38).setRegParam(0.21).setStandardization(true) - val model3a0 = trainer3a.fit(data) - val model3a1 = trainer3a.fit(weightedData) - val model3b = trainer3b.fit(weightedData) - assert(model3a0.weights !~= model3a1.weights absTol 1E-3) - assert(model3a0.weights ~== model3b.weights absTol 1E-3) - - val trainer4a = (new LinearRegression).setFitIntercept(false) - .setElasticNetParam(0.38).setRegParam(0.21).setStandardization(false) - val trainer4b = (new LinearRegression).setFitIntercept(false).setWeightCol("weight") - .setElasticNetParam(0.38).setRegParam(0.21).setStandardization(false) - val model4a0 = trainer4a.fit(data) - val model4a1 = trainer4a.fit(weightedData) - val model4b = trainer4b.fit(weightedData) - assert(model4a0.weights !~= model4a1.weights absTol 1E-3) - assert(model4a0.weights ~== model4b.weights absTol 1E-3) } } diff --git a/mllib/src/test/scala/org/apache/spark/ml/tuning/CrossValidatorSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/tuning/CrossValidatorSuite.scala index fde02e0c84bc0..cbe09292a0337 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/tuning/CrossValidatorSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/tuning/CrossValidatorSuite.scala @@ -69,7 +69,7 @@ class CrossValidatorSuite extends SparkFunSuite with MLlibTestSparkContext { sc.parallelize(LinearDataGenerator.generateLinearInput( 6.3, Array(4.7, 7.2), Array(0.9, -1.3), Array(0.7, 1.2), 100, 42, 0.1), 2)) - val trainer = new LinearRegression + val trainer = new LinearRegression().setSolver("l-bfgs") val lrParamMaps = new ParamGridBuilder() .addGrid(trainer.regParam, Array(1000.0, 0.001)) .addGrid(trainer.maxIter, Array(0, 10)) diff --git a/mllib/src/test/scala/org/apache/spark/ml/tuning/TrainValidationSplitSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/tuning/TrainValidationSplitSuite.scala index ef24e6fb6b80f..5fb80091d0b4b 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/tuning/TrainValidationSplitSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/tuning/TrainValidationSplitSuite.scala @@ -58,7 +58,7 @@ class TrainValidationSplitSuite extends SparkFunSuite with MLlibTestSparkContext sc.parallelize(LinearDataGenerator.generateLinearInput( 6.3, Array(4.7, 7.2), Array(0.9, -1.3), Array(0.7, 1.2), 100, 42, 0.1), 2)) - val trainer = new LinearRegression + val trainer = new LinearRegression().setSolver("l-bfgs") val lrParamMaps = new ParamGridBuilder() .addGrid(trainer.regParam, Array(1000.0, 0.001)) .addGrid(trainer.maxIter, Array(0, 10)) From 7893cd95db5f2caba59ff5c859d7e4964ad7938d Mon Sep 17 00:00:00 2001 From: Wenchen Fan Date: Mon, 19 Oct 2015 11:02:26 -0700 Subject: [PATCH 101/139] [SPARK-11119] [SQL] cleanup for unsafe array and map The purpose of this PR is to keep the unsafe format detail only inside the unsafe class itself, so when we use them(like use unsafe array in unsafe map, use unsafe array and map in columnar cache), we don't need to understand the format before use them. change list: * unsafe array's 4-bytes numElements header is now required(was optional), and become a part of unsafe array format. * w.r.t the previous changing, the `sizeInBytes` of unsafe array now counts the 4-bytes header. 
* unsafe map's format was `[numElements] [key array numBytes] [key array content(without numElements header)] [value array content(without numElements header)]` before, which is a little hacky as it makes unsafe array's header optional. I think saving 4 bytes is not a big deal, so the format is now: `[key array numBytes] [unsafe key array] [unsafe value array]`. * w.r.t the previous changing, the `sizeInBytes` of unsafe map now counts both map's header and array's header. Author: Wenchen Fan Closes #9131 from cloud-fan/unsafe. --- .../catalyst/expressions/UnsafeArrayData.java | 43 +++++---- .../catalyst/expressions/UnsafeMapData.java | 88 +++++++++++++++---- .../catalyst/expressions/UnsafeReaders.java | 54 ------------ .../sql/catalyst/expressions/UnsafeRow.java | 8 +- .../codegen/UnsafeArrayWriter.java | 24 ++--- .../expressions/codegen/UnsafeRowWriter.java | 15 ---- .../codegen/GenerateUnsafeProjection.scala | 60 +++++++------ .../expressions/UnsafeRowConverterSuite.scala | 42 ++++----- .../spark/sql/columnar/ColumnType.scala | 30 ++++--- .../spark/sql/columnar/ColumnTypeSuite.scala | 2 +- 10 files changed, 174 insertions(+), 192 deletions(-) delete mode 100644 sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeReaders.java diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeArrayData.java b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeArrayData.java index 4c63abb071e3b..761f0447943e8 100644 --- a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeArrayData.java +++ b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeArrayData.java @@ -30,19 +30,18 @@ /** * An Unsafe implementation of Array which is backed by raw memory instead of Java objects. * - * Each tuple has two parts: [offsets] [values] + * Each tuple has three parts: [numElements] [offsets] [values] * - * In the `offsets` region, we store 4 bytes per element, represents the start address of this - * element in `values` region. We can get the length of this element by subtracting next offset. + * The `numElements` is 4 bytes storing the number of elements of this array. + * + * In the `offsets` region, we store 4 bytes per element, represents the relative offset (w.r.t. the + * base address of the array) of this element in `values` region. We can get the length of this + * element by subtracting next offset. * Note that offset can by negative which means this element is null. * * In the `values` region, we store the content of elements. As we can get length info, so elements * can be variable-length. * - * Note that when we write out this array, we should write out the `numElements` at first 4 bytes, - * then follows content. When we read in an array, we should read first 4 bytes as `numElements` - * and take the rest as content. - * * Instances of `UnsafeArrayData` act as pointers to row data stored in this format. */ // todo: there is a lof of duplicated code between UnsafeRow and UnsafeArrayData. @@ -54,11 +53,16 @@ public class UnsafeArrayData extends ArrayData { // The number of elements in this array private int numElements; - // The size of this array's backing data, in bytes + // The size of this array's backing data, in bytes. + // The 4-bytes header of `numElements` is also included. 
private int sizeInBytes; + public Object getBaseObject() { return baseObject; } + public long getBaseOffset() { return baseOffset; } + public int getSizeInBytes() { return sizeInBytes; } + private int getElementOffset(int ordinal) { - return Platform.getInt(baseObject, baseOffset + ordinal * 4L); + return Platform.getInt(baseObject, baseOffset + 4 + ordinal * 4L); } private int getElementSize(int offset, int ordinal) { @@ -85,10 +89,6 @@ public Object[] array() { */ public UnsafeArrayData() { } - public Object getBaseObject() { return baseObject; } - public long getBaseOffset() { return baseOffset; } - public int getSizeInBytes() { return sizeInBytes; } - @Override public int numElements() { return numElements; } @@ -97,10 +97,13 @@ public UnsafeArrayData() { } * * @param baseObject the base object * @param baseOffset the offset within the base object - * @param sizeInBytes the size of this row's backing data, in bytes + * @param sizeInBytes the size of this array's backing data, in bytes */ - public void pointTo(Object baseObject, long baseOffset, int numElements, int sizeInBytes) { + public void pointTo(Object baseObject, long baseOffset, int sizeInBytes) { + // Read the number of elements from the first 4 bytes. + final int numElements = Platform.getInt(baseObject, baseOffset); assert numElements >= 0 : "numElements (" + numElements + ") should >= 0"; + this.numElements = numElements; this.baseObject = baseObject; this.baseOffset = baseOffset; @@ -277,7 +280,9 @@ public UnsafeArrayData getArray(int ordinal) { final int offset = getElementOffset(ordinal); if (offset < 0) return null; final int size = getElementSize(offset, ordinal); - return UnsafeReaders.readArray(baseObject, baseOffset + offset, size); + final UnsafeArrayData array = new UnsafeArrayData(); + array.pointTo(baseObject, baseOffset + offset, size); + return array; } @Override @@ -286,7 +291,9 @@ public UnsafeMapData getMap(int ordinal) { final int offset = getElementOffset(ordinal); if (offset < 0) return null; final int size = getElementSize(offset, ordinal); - return UnsafeReaders.readMap(baseObject, baseOffset + offset, size); + final UnsafeMapData map = new UnsafeMapData(); + map.pointTo(baseObject, baseOffset + offset, size); + return map; } @Override @@ -328,7 +335,7 @@ public UnsafeArrayData copy() { final byte[] arrayDataCopy = new byte[sizeInBytes]; Platform.copyMemory( baseObject, baseOffset, arrayDataCopy, Platform.BYTE_ARRAY_OFFSET, sizeInBytes); - arrayCopy.pointTo(arrayDataCopy, Platform.BYTE_ARRAY_OFFSET, numElements, sizeInBytes); + arrayCopy.pointTo(arrayDataCopy, Platform.BYTE_ARRAY_OFFSET, sizeInBytes); return arrayCopy; } } diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeMapData.java b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeMapData.java index e9dab9edb6bd1..5bebe2a96e391 100644 --- a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeMapData.java +++ b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeMapData.java @@ -17,41 +17,73 @@ package org.apache.spark.sql.catalyst.expressions; +import java.nio.ByteBuffer; + import org.apache.spark.sql.types.MapData; +import org.apache.spark.unsafe.Platform; /** * An Unsafe implementation of Map which is backed by raw memory instead of Java objects. * - * Currently we just use 2 UnsafeArrayData to represent UnsafeMapData. 
- * - * Note that when we write out this map, we should write out the `numElements` at first 4 bytes, - * and numBytes of key array at second 4 bytes, then follows key array content and value array - * content without `numElements` header. - * When we read in a map, we should read first 4 bytes as `numElements` and second 4 bytes as - * numBytes of key array, and construct unsafe key array and value array with these 2 information. + * Currently we just use 2 UnsafeArrayData to represent UnsafeMapData, with extra 4 bytes at head + * to indicate the number of bytes of the unsafe key array. + * [unsafe key array numBytes] [unsafe key array] [unsafe value array] */ +// TODO: Use a more efficient format which doesn't depend on unsafe array. public class UnsafeMapData extends MapData { - private final UnsafeArrayData keys; - private final UnsafeArrayData values; - // The number of elements in this array - private int numElements; - // The size of this array's backing data, in bytes + private Object baseObject; + private long baseOffset; + + // The size of this map's backing data, in bytes. + // The 4-bytes header of key array `numBytes` is also included, so it's actually equal to + // 4 + key array numBytes + value array numBytes. private int sizeInBytes; + public Object getBaseObject() { return baseObject; } + public long getBaseOffset() { return baseOffset; } public int getSizeInBytes() { return sizeInBytes; } - public UnsafeMapData(UnsafeArrayData keys, UnsafeArrayData values) { + private final UnsafeArrayData keys; + private final UnsafeArrayData values; + + /** + * Construct a new UnsafeMapData. The resulting UnsafeMapData won't be usable until + * `pointTo()` has been called, since the value returned by this constructor is equivalent + * to a null pointer. + */ + public UnsafeMapData() { + keys = new UnsafeArrayData(); + values = new UnsafeArrayData(); + } + + /** + * Update this UnsafeMapData to point to different backing data. + * + * @param baseObject the base object + * @param baseOffset the offset within the base object + * @param sizeInBytes the size of this map's backing data, in bytes + */ + public void pointTo(Object baseObject, long baseOffset, int sizeInBytes) { + // Read the numBytes of key array from the first 4 bytes. 
+ final int keyArraySize = Platform.getInt(baseObject, baseOffset); + final int valueArraySize = sizeInBytes - keyArraySize - 4; + assert keyArraySize >= 0 : "keyArraySize (" + keyArraySize + ") should >= 0"; + assert valueArraySize >= 0 : "valueArraySize (" + valueArraySize + ") should >= 0"; + + keys.pointTo(baseObject, baseOffset + 4, keyArraySize); + values.pointTo(baseObject, baseOffset + 4 + keyArraySize, valueArraySize); + assert keys.numElements() == values.numElements(); - this.sizeInBytes = keys.getSizeInBytes() + values.getSizeInBytes(); - this.numElements = keys.numElements(); - this.keys = keys; - this.values = values; + + this.baseObject = baseObject; + this.baseOffset = baseOffset; + this.sizeInBytes = sizeInBytes; } @Override public int numElements() { - return numElements; + return keys.numElements(); } @Override @@ -64,8 +96,26 @@ public UnsafeArrayData valueArray() { return values; } + public void writeToMemory(Object target, long targetOffset) { + Platform.copyMemory(baseObject, baseOffset, target, targetOffset, sizeInBytes); + } + + public void writeTo(ByteBuffer buffer) { + assert(buffer.hasArray()); + byte[] target = buffer.array(); + int offset = buffer.arrayOffset(); + int pos = buffer.position(); + writeToMemory(target, Platform.BYTE_ARRAY_OFFSET + offset + pos); + buffer.position(pos + sizeInBytes); + } + @Override public UnsafeMapData copy() { - return new UnsafeMapData(keys.copy(), values.copy()); + UnsafeMapData mapCopy = new UnsafeMapData(); + final byte[] mapDataCopy = new byte[sizeInBytes]; + Platform.copyMemory( + baseObject, baseOffset, mapDataCopy, Platform.BYTE_ARRAY_OFFSET, sizeInBytes); + mapCopy.pointTo(mapDataCopy, Platform.BYTE_ARRAY_OFFSET, sizeInBytes); + return mapCopy; } } diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeReaders.java b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeReaders.java deleted file mode 100644 index 6c5fcbca63fd7..0000000000000 --- a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeReaders.java +++ /dev/null @@ -1,54 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.sql.catalyst.expressions; - -import org.apache.spark.unsafe.Platform; - -public class UnsafeReaders { - - /** - * Reads in unsafe array according to the format described in `UnsafeArrayData`. - */ - public static UnsafeArrayData readArray(Object baseObject, long baseOffset, int numBytes) { - // Read the number of elements from first 4 bytes. - final int numElements = Platform.getInt(baseObject, baseOffset); - final UnsafeArrayData array = new UnsafeArrayData(); - // Skip the first 4 bytes. 
- array.pointTo(baseObject, baseOffset + 4, numElements, numBytes - 4); - return array; - } - - /** - * Reads in unsafe map according to the format described in `UnsafeMapData`. - */ - public static UnsafeMapData readMap(Object baseObject, long baseOffset, int numBytes) { - // Read the number of elements from first 4 bytes. - final int numElements = Platform.getInt(baseObject, baseOffset); - // Read the numBytes of key array in second 4 bytes. - final int keyArraySize = Platform.getInt(baseObject, baseOffset + 4); - final int valueArraySize = numBytes - 8 - keyArraySize; - - final UnsafeArrayData keyArray = new UnsafeArrayData(); - keyArray.pointTo(baseObject, baseOffset + 8, numElements, keyArraySize); - - final UnsafeArrayData valueArray = new UnsafeArrayData(); - valueArray.pointTo(baseObject, baseOffset + 8 + keyArraySize, numElements, valueArraySize); - - return new UnsafeMapData(keyArray, valueArray); - } -} diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRow.java b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRow.java index 36859fbab9744..366615f6fe69f 100644 --- a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRow.java +++ b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRow.java @@ -461,7 +461,9 @@ public UnsafeArrayData getArray(int ordinal) { final long offsetAndSize = getLong(ordinal); final int offset = (int) (offsetAndSize >> 32); final int size = (int) (offsetAndSize & ((1L << 32) - 1)); - return UnsafeReaders.readArray(baseObject, baseOffset + offset, size); + final UnsafeArrayData array = new UnsafeArrayData(); + array.pointTo(baseObject, baseOffset + offset, size); + return array; } } @@ -473,7 +475,9 @@ public UnsafeMapData getMap(int ordinal) { final long offsetAndSize = getLong(ordinal); final int offset = (int) (offsetAndSize >> 32); final int size = (int) (offsetAndSize & ((1L << 32) - 1)); - return UnsafeReaders.readMap(baseObject, baseOffset + offset, size); + final UnsafeMapData map = new UnsafeMapData(); + map.pointTo(baseObject, baseOffset + offset, size); + return map; } } diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/codegen/UnsafeArrayWriter.java b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/codegen/UnsafeArrayWriter.java index 138178ce99d85..7f2a1cb07af01 100644 --- a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/codegen/UnsafeArrayWriter.java +++ b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/codegen/UnsafeArrayWriter.java @@ -30,17 +30,19 @@ public class UnsafeArrayWriter { private BufferHolder holder; + // The offset of the global buffer where we start to write this array. private int startingOffset; public void initialize(BufferHolder holder, int numElements, int fixedElementSize) { - // We need 4 bytes each element to store offset. - final int fixedSize = 4 * numElements; + // We need 4 bytes to store numElements and 4 bytes each element to store offset. + final int fixedSize = 4 + 4 * numElements; this.holder = holder; this.startingOffset = holder.cursor; holder.grow(fixedSize); + Platform.putInt(holder.buffer, holder.cursor, numElements); holder.cursor += fixedSize; // Grows the global buffer ahead for fixed size data. 
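To make the layout this writer produces concrete, here is a small, self-contained Scala sketch (toy code, not the real UnsafeArrayData; it assumes the whole array starts at index 0 of a byte array and that ints are stored in the platform's native byte order, as Platform.getInt does) of how a reader walks the [numElements] [offsets] [values] format:

    import java.nio.{ByteBuffer, ByteOrder}

    object UnsafeArrayLayoutSketch {
      /** Prints where each element of one unsafe array lives inside `bytes`. */
      def describe(bytes: Array[Byte]): Unit = {
        val buf = ByteBuffer.wrap(bytes).order(ByteOrder.nativeOrder())
        val numElements = buf.getInt(0)   // the 4-byte header written by initialize()
        println(s"numElements = $numElements, sizeInBytes = ${bytes.length}")
        var i = 0
        while (i < numElements) {
          // Offsets are relative to the start of the array; a negative offset marks a null element.
          val offset = buf.getInt(4 + i * 4)
          if (offset < 0) println(s"element $i: null")
          else println(s"element $i: starts at byte $offset of the array")
          i += 1
        }
      }
    }

An element's length is recovered by subtracting its offset from the next element's offset (or from sizeInBytes for the last one), which is why sizeInBytes now includes the 4-byte header as well.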
@@ -48,7 +50,7 @@ public void initialize(BufferHolder holder, int numElements, int fixedElementSiz } private long getElementOffset(int ordinal) { - return startingOffset + 4 * ordinal; + return startingOffset + 4 + 4 * ordinal; } public void setNullAt(int ordinal) { @@ -132,20 +134,4 @@ public void write(int ordinal, CalendarInterval input) { // move the cursor forward. holder.cursor += 16; } - - - - // If this array is already an UnsafeArray, we don't need to go through all elements, we can - // directly write it. - public static void directWrite(BufferHolder holder, UnsafeArrayData input) { - final int numBytes = input.getSizeInBytes(); - - // grow the global buffer before writing data. - holder.grow(numBytes); - - // Writes the array content to the variable length portion. - input.writeToMemory(holder.buffer, holder.cursor); - - holder.cursor += numBytes; - } } diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/codegen/UnsafeRowWriter.java b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/codegen/UnsafeRowWriter.java index 8b7debd440031..e1f5a05d1d446 100644 --- a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/codegen/UnsafeRowWriter.java +++ b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/codegen/UnsafeRowWriter.java @@ -181,19 +181,4 @@ public void write(int ordinal, CalendarInterval input) { // move the cursor forward. holder.cursor += 16; } - - - - // If this struct is already an UnsafeRow, we don't need to go through all fields, we can - // directly write it. - public static void directWrite(BufferHolder holder, UnsafeRow input) { - // No need to zero-out the bytes as UnsafeRow is word aligned for sure. - final int numBytes = input.getSizeInBytes(); - // grow the global buffer before writing data. - holder.grow(numBytes); - // Write the bytes to the variable length portion. - input.writeToMemory(holder.buffer, holder.cursor); - // move the cursor forward. 
- holder.cursor += numBytes; - } } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateUnsafeProjection.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateUnsafeProjection.scala index 1b957a508d10e..dbe92d6a83502 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateUnsafeProjection.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/GenerateUnsafeProjection.scala @@ -62,7 +62,7 @@ object GenerateUnsafeProjection extends CodeGenerator[Seq[Expression], UnsafePro s""" if ($input instanceof UnsafeRow) { - $rowWriterClass.directWrite($bufferHolder, (UnsafeRow) $input); + ${writeUnsafeData(ctx, s"((UnsafeRow) $input)", bufferHolder)} } else { ${writeExpressionsToBuffer(ctx, input, fieldEvals, fieldTypes, bufferHolder)} } @@ -164,8 +164,7 @@ object GenerateUnsafeProjection extends CodeGenerator[Seq[Expression], UnsafePro ctx: CodeGenContext, input: String, elementType: DataType, - bufferHolder: String, - needHeader: Boolean = true): String = { + bufferHolder: String): String = { val arrayWriter = ctx.freshName("arrayWriter") ctx.addMutableState(arrayWriterClass, arrayWriter, s"this.$arrayWriter = new $arrayWriterClass();") @@ -227,21 +226,11 @@ object GenerateUnsafeProjection extends CodeGenerator[Seq[Expression], UnsafePro case _ => s"$arrayWriter.write($index, $element);" } - val writeHeader = if (needHeader) { - // If header is required, we need to write the number of elements into first 4 bytes. - s""" - $bufferHolder.grow(4); - Platform.putInt($bufferHolder.buffer, $bufferHolder.cursor, $numElements); - $bufferHolder.cursor += 4; - """ - } else "" - s""" - final int $numElements = $input.numElements(); - $writeHeader if ($input instanceof UnsafeArrayData) { - $arrayWriterClass.directWrite($bufferHolder, (UnsafeArrayData) $input); + ${writeUnsafeData(ctx, s"((UnsafeArrayData) $input)", bufferHolder)} } else { + final int $numElements = $input.numElements(); $arrayWriter.initialize($bufferHolder, $numElements, $fixedElementSize); for (int $index = 0; $index < $numElements; $index++) { @@ -270,23 +259,40 @@ object GenerateUnsafeProjection extends CodeGenerator[Seq[Expression], UnsafePro // Writes out unsafe map according to the format described in `UnsafeMapData`. s""" - final ArrayData $keys = $input.keyArray(); - final ArrayData $values = $input.valueArray(); + if ($input instanceof UnsafeMapData) { + ${writeUnsafeData(ctx, s"((UnsafeMapData) $input)", bufferHolder)} + } else { + final ArrayData $keys = $input.keyArray(); + final ArrayData $values = $input.valueArray(); - $bufferHolder.grow(8); + // preserve 4 bytes to write the key array numBytes later. + $bufferHolder.grow(4); + $bufferHolder.cursor += 4; - // Write the numElements into first 4 bytes. - Platform.putInt($bufferHolder.buffer, $bufferHolder.cursor, $keys.numElements()); + // Remember the current cursor so that we can write numBytes of key array later. + final int $tmpCursor = $bufferHolder.cursor; - $bufferHolder.cursor += 8; - // Remember the current cursor so that we can write numBytes of key array later. - final int $tmpCursor = $bufferHolder.cursor; + ${writeArrayToBuffer(ctx, keys, keyType, bufferHolder)} + // Write the numBytes of key array into the first 4 bytes. 
+ Platform.putInt($bufferHolder.buffer, $tmpCursor - 4, $bufferHolder.cursor - $tmpCursor); - ${writeArrayToBuffer(ctx, keys, keyType, bufferHolder, needHeader = false)} - // Write the numBytes of key array into second 4 bytes. - Platform.putInt($bufferHolder.buffer, $tmpCursor - 4, $bufferHolder.cursor - $tmpCursor); + ${writeArrayToBuffer(ctx, values, valueType, bufferHolder)} + } + """ + } - ${writeArrayToBuffer(ctx, values, valueType, bufferHolder, needHeader = false)} + /** + * If the input is already in unsafe format, we don't need to go through all elements/fields, + * we can directly write it. + */ + private def writeUnsafeData(ctx: CodeGenContext, input: String, bufferHolder: String) = { + val sizeInBytes = ctx.freshName("sizeInBytes") + s""" + final int $sizeInBytes = $input.getSizeInBytes(); + // grow the global buffer before writing data. + $bufferHolder.grow($sizeInBytes); + $input.writeToMemory($bufferHolder.buffer, $bufferHolder.cursor); + $bufferHolder.cursor += $sizeInBytes; """ } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/UnsafeRowConverterSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/UnsafeRowConverterSuite.scala index c991cd86d28c8..c6aad34e972b5 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/UnsafeRowConverterSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/UnsafeRowConverterSuite.scala @@ -296,13 +296,9 @@ class UnsafeRowConverterSuite extends SparkFunSuite with Matchers { new ArrayBasedMapData(createArray(keys: _*), createArray(values: _*)) } - private def arraySizeInRow(numBytes: Int): Int = roundedSize(4 + numBytes) - - private def mapSizeInRow(numBytes: Int): Int = roundedSize(8 + numBytes) - private def testArrayInt(array: UnsafeArrayData, values: Seq[Int]): Unit = { assert(array.numElements == values.length) - assert(array.getSizeInBytes == (4 + 4) * values.length) + assert(array.getSizeInBytes == 4 + (4 + 4) * values.length) values.zipWithIndex.foreach { case (value, index) => assert(array.getInt(index) == value) } @@ -315,7 +311,7 @@ class UnsafeRowConverterSuite extends SparkFunSuite with Matchers { testArrayInt(map.keyArray, keys) testArrayInt(map.valueArray, values) - assert(map.getSizeInBytes == map.keyArray.getSizeInBytes + map.valueArray.getSizeInBytes) + assert(map.getSizeInBytes == 4 + map.keyArray.getSizeInBytes + map.valueArray.getSizeInBytes) } test("basic conversion with array type") { @@ -341,10 +337,10 @@ class UnsafeRowConverterSuite extends SparkFunSuite with Matchers { val nestedArray = unsafeArray2.getArray(0) testArrayInt(nestedArray, Seq(3, 4)) - assert(unsafeArray2.getSizeInBytes == 4 + (4 + nestedArray.getSizeInBytes)) + assert(unsafeArray2.getSizeInBytes == 4 + 4 + nestedArray.getSizeInBytes) - val array1Size = arraySizeInRow(unsafeArray1.getSizeInBytes) - val array2Size = arraySizeInRow(unsafeArray2.getSizeInBytes) + val array1Size = roundedSize(unsafeArray1.getSizeInBytes) + val array2Size = roundedSize(unsafeArray2.getSizeInBytes) assert(unsafeRow.getSizeInBytes == 8 + 8 * 2 + array1Size + array2Size) } @@ -384,13 +380,13 @@ class UnsafeRowConverterSuite extends SparkFunSuite with Matchers { val nestedMap = valueArray.getMap(0) testMapInt(nestedMap, Seq(5, 6), Seq(7, 8)) - assert(valueArray.getSizeInBytes == 4 + (8 + nestedMap.getSizeInBytes)) + assert(valueArray.getSizeInBytes == 4 + 4 + nestedMap.getSizeInBytes) } - assert(unsafeMap2.getSizeInBytes == keyArray.getSizeInBytes + 
valueArray.getSizeInBytes) + assert(unsafeMap2.getSizeInBytes == 4 + keyArray.getSizeInBytes + valueArray.getSizeInBytes) - val map1Size = mapSizeInRow(unsafeMap1.getSizeInBytes) - val map2Size = mapSizeInRow(unsafeMap2.getSizeInBytes) + val map1Size = roundedSize(unsafeMap1.getSizeInBytes) + val map2Size = roundedSize(unsafeMap2.getSizeInBytes) assert(unsafeRow.getSizeInBytes == 8 + 8 * 2 + map1Size + map2Size) } @@ -414,7 +410,7 @@ class UnsafeRowConverterSuite extends SparkFunSuite with Matchers { val innerArray = field1.getArray(0) testArrayInt(innerArray, Seq(1)) - assert(field1.getSizeInBytes == 8 + 8 + arraySizeInRow(innerArray.getSizeInBytes)) + assert(field1.getSizeInBytes == 8 + 8 + roundedSize(innerArray.getSizeInBytes)) val field2 = unsafeRow.getArray(1) assert(field2.numElements == 1) @@ -427,10 +423,10 @@ class UnsafeRowConverterSuite extends SparkFunSuite with Matchers { assert(innerStruct.getLong(0) == 2L) } - assert(field2.getSizeInBytes == 4 + innerStruct.getSizeInBytes) + assert(field2.getSizeInBytes == 4 + 4 + innerStruct.getSizeInBytes) assert(unsafeRow.getSizeInBytes == - 8 + 8 * 2 + field1.getSizeInBytes + arraySizeInRow(field2.getSizeInBytes)) + 8 + 8 * 2 + field1.getSizeInBytes + roundedSize(field2.getSizeInBytes)) } test("basic conversion with struct and map") { @@ -453,7 +449,7 @@ class UnsafeRowConverterSuite extends SparkFunSuite with Matchers { val innerMap = field1.getMap(0) testMapInt(innerMap, Seq(1), Seq(2)) - assert(field1.getSizeInBytes == 8 + 8 + mapSizeInRow(innerMap.getSizeInBytes)) + assert(field1.getSizeInBytes == 8 + 8 + roundedSize(innerMap.getSizeInBytes)) val field2 = unsafeRow.getMap(1) @@ -470,13 +466,13 @@ class UnsafeRowConverterSuite extends SparkFunSuite with Matchers { assert(innerStruct.getSizeInBytes == 8 + 8) assert(innerStruct.getLong(0) == 4L) - assert(valueArray.getSizeInBytes == 4 + innerStruct.getSizeInBytes) + assert(valueArray.getSizeInBytes == 4 + 4 + innerStruct.getSizeInBytes) } - assert(field2.getSizeInBytes == keyArray.getSizeInBytes + valueArray.getSizeInBytes) + assert(field2.getSizeInBytes == 4 + keyArray.getSizeInBytes + valueArray.getSizeInBytes) assert(unsafeRow.getSizeInBytes == - 8 + 8 * 2 + field1.getSizeInBytes + mapSizeInRow(field2.getSizeInBytes)) + 8 + 8 * 2 + field1.getSizeInBytes + roundedSize(field2.getSizeInBytes)) } test("basic conversion with array and map") { @@ -499,7 +495,7 @@ class UnsafeRowConverterSuite extends SparkFunSuite with Matchers { val innerMap = field1.getMap(0) testMapInt(innerMap, Seq(1), Seq(2)) - assert(field1.getSizeInBytes == 4 + (8 + innerMap.getSizeInBytes)) + assert(field1.getSizeInBytes == 4 + 4 + innerMap.getSizeInBytes) val field2 = unsafeRow.getMap(1) assert(field2.numElements == 1) @@ -518,9 +514,9 @@ class UnsafeRowConverterSuite extends SparkFunSuite with Matchers { assert(valueArray.getSizeInBytes == 4 + (4 + innerArray.getSizeInBytes)) } - assert(field2.getSizeInBytes == keyArray.getSizeInBytes + valueArray.getSizeInBytes) + assert(field2.getSizeInBytes == 4 + keyArray.getSizeInBytes + valueArray.getSizeInBytes) assert(unsafeRow.getSizeInBytes == - 8 + 8 * 2 + arraySizeInRow(field1.getSizeInBytes) + mapSizeInRow(field2.getSizeInBytes)) + 8 + 8 * 2 + roundedSize(field1.getSizeInBytes) + roundedSize(field2.getSizeInBytes)) } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/columnar/ColumnType.scala b/sql/core/src/main/scala/org/apache/spark/sql/columnar/ColumnType.scala index 2bc2c96b61634..a41f04dd3b59a 100644 --- 
a/sql/core/src/main/scala/org/apache/spark/sql/columnar/ColumnType.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/columnar/ColumnType.scala @@ -482,12 +482,14 @@ private[sql] case class STRUCT(dataType: StructType) extends ColumnType[UnsafeRo override def extract(buffer: ByteBuffer): UnsafeRow = { val sizeInBytes = buffer.getInt() assert(buffer.hasArray) - val base = buffer.array() - val offset = buffer.arrayOffset() val cursor = buffer.position() buffer.position(cursor + sizeInBytes) val unsafeRow = new UnsafeRow - unsafeRow.pointTo(base, Platform.BYTE_ARRAY_OFFSET + offset + cursor, numOfFields, sizeInBytes) + unsafeRow.pointTo( + buffer.array(), + Platform.BYTE_ARRAY_OFFSET + buffer.arrayOffset() + cursor, + numOfFields, + sizeInBytes) unsafeRow } @@ -508,12 +510,11 @@ private[sql] case class ARRAY(dataType: ArrayType) extends ColumnType[UnsafeArra override def actualSize(row: InternalRow, ordinal: Int): Int = { val unsafeArray = getField(row, ordinal) - 4 + 4 + unsafeArray.getSizeInBytes + 4 + unsafeArray.getSizeInBytes } override def append(value: UnsafeArrayData, buffer: ByteBuffer): Unit = { - buffer.putInt(4 + value.getSizeInBytes) - buffer.putInt(value.numElements()) + buffer.putInt(value.getSizeInBytes) value.writeTo(buffer) } @@ -522,10 +523,12 @@ private[sql] case class ARRAY(dataType: ArrayType) extends ColumnType[UnsafeArra assert(buffer.hasArray) val cursor = buffer.position() buffer.position(cursor + numBytes) - UnsafeReaders.readArray( + val array = new UnsafeArrayData + array.pointTo( buffer.array(), Platform.BYTE_ARRAY_OFFSET + buffer.arrayOffset() + cursor, numBytes) + array } override def clone(v: UnsafeArrayData): UnsafeArrayData = v.copy() @@ -545,15 +548,12 @@ private[sql] case class MAP(dataType: MapType) extends ColumnType[UnsafeMapData] override def actualSize(row: InternalRow, ordinal: Int): Int = { val unsafeMap = getField(row, ordinal) - 12 + unsafeMap.keyArray().getSizeInBytes + unsafeMap.valueArray().getSizeInBytes + 4 + unsafeMap.getSizeInBytes } override def append(value: UnsafeMapData, buffer: ByteBuffer): Unit = { - buffer.putInt(8 + value.keyArray().getSizeInBytes + value.valueArray().getSizeInBytes) - buffer.putInt(value.numElements()) - buffer.putInt(value.keyArray().getSizeInBytes) - value.keyArray().writeTo(buffer) - value.valueArray().writeTo(buffer) + buffer.putInt(value.getSizeInBytes) + value.writeTo(buffer) } override def extract(buffer: ByteBuffer): UnsafeMapData = { @@ -561,10 +561,12 @@ private[sql] case class MAP(dataType: MapType) extends ColumnType[UnsafeMapData] assert(buffer.hasArray) val cursor = buffer.position() buffer.position(cursor + numBytes) - UnsafeReaders.readMap( + val map = new UnsafeMapData + map.pointTo( buffer.array(), Platform.BYTE_ARRAY_OFFSET + buffer.arrayOffset() + cursor, numBytes) + map } override def clone(v: UnsafeMapData): UnsafeMapData = v.copy() diff --git a/sql/core/src/test/scala/org/apache/spark/sql/columnar/ColumnTypeSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/columnar/ColumnTypeSuite.scala index 0e6e1bcf72896..63bc39bfa0307 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/columnar/ColumnTypeSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/columnar/ColumnTypeSuite.scala @@ -73,7 +73,7 @@ class ColumnTypeSuite extends SparkFunSuite with Logging { checkActualSize(COMPACT_DECIMAL(15, 10), Decimal(0, 15, 10), 8) checkActualSize(LARGE_DECIMAL(20, 10), Decimal(0, 20, 10), 5) checkActualSize(ARRAY_TYPE, Array[Any](1), 16) - checkActualSize(MAP_TYPE, Map(1 -> "a"), 
25) + checkActualSize(MAP_TYPE, Map(1 -> "a"), 29) checkActualSize(STRUCT_TYPE, Row("hello"), 28) } From 5966817941b57251fbd1cf8b9b458ec389c071a0 Mon Sep 17 00:00:00 2001 From: Rishabh Bhardwaj Date: Mon, 19 Oct 2015 14:38:49 -0700 Subject: [PATCH 102/139] [SPARK-11180][SQL] Support BooleanType in DataFrame.na.fill Added support for boolean types in fill and replace methods Author: Rishabh Bhardwaj Closes #9166 from rishabhbhardwaj/master. --- .../spark/sql/DataFrameNaFunctions.scala | 29 ++++++++++++------- .../spark/sql/DataFrameNaFunctionsSuite.scala | 14 +++++---- 2 files changed, 27 insertions(+), 16 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameNaFunctions.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameNaFunctions.scala index 77a42c0873a6b..f7be5f6b370ab 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameNaFunctions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameNaFunctions.scala @@ -198,7 +198,8 @@ final class DataFrameNaFunctions private[sql](df: DataFrame) { * Returns a new [[DataFrame]] that replaces null values. * * The key of the map is the column name, and the value of the map is the replacement value. - * The value must be of the following type: `Integer`, `Long`, `Float`, `Double`, `String`. + * The value must be of the following type: + * `Integer`, `Long`, `Float`, `Double`, `String`, `Boolean`. * * For example, the following replaces null values in column "A" with string "unknown", and * null values in column "B" with numeric value 1.0. @@ -215,7 +216,7 @@ final class DataFrameNaFunctions private[sql](df: DataFrame) { * (Scala-specific) Returns a new [[DataFrame]] that replaces null values. * * The key of the map is the column name, and the value of the map is the replacement value. - * The value must be of the following type: `Int`, `Long`, `Float`, `Double`, `String`. + * The value must be of the following type: `Int`, `Long`, `Float`, `Double`, `String`, `Boolean`. * * For example, the following replaces null values in column "A" with string "unknown", and * null values in column "B" with numeric value 1.0. @@ -232,7 +233,8 @@ final class DataFrameNaFunctions private[sql](df: DataFrame) { /** * Replaces values matching keys in `replacement` map with the corresponding values. - * Key and value of `replacement` map must have the same type, and can only be doubles or strings. + * Key and value of `replacement` map must have the same type, and + * can only be doubles, strings or booleans. * If `col` is "*", then the replacement is applied on all string columns or numeric columns. * * {{{ @@ -259,7 +261,8 @@ final class DataFrameNaFunctions private[sql](df: DataFrame) { /** * Replaces values matching keys in `replacement` map with the corresponding values. - * Key and value of `replacement` map must have the same type, and can only be doubles or strings. + * Key and value of `replacement` map must have the same type, and + * can only be doubles, strings or booleans. * * {{{ * import com.google.common.collect.ImmutableMap; @@ -282,8 +285,10 @@ final class DataFrameNaFunctions private[sql](df: DataFrame) { /** * (Scala-specific) Replaces values matching keys in `replacement` map. - * Key and value of `replacement` map must have the same type, and can only be doubles or strings. - * If `col` is "*", then the replacement is applied on all string columns or numeric columns. + * Key and value of `replacement` map must have the same type, and + * can only be doubles, strings or booleans. 
+ * If `col` is "*", + * then the replacement is applied on all string columns , numeric columns or boolean columns. * * {{{ * // Replaces all occurrences of 1.0 with 2.0 in column "height". @@ -311,7 +316,8 @@ final class DataFrameNaFunctions private[sql](df: DataFrame) { /** * (Scala-specific) Replaces values matching keys in `replacement` map. - * Key and value of `replacement` map must have the same type, and can only be doubles or strings. + * Key and value of `replacement` map must have the same type, and + * can only be doubles , strings or booleans. * * {{{ * // Replaces all occurrences of 1.0 with 2.0 in column "height" and "weight". @@ -333,15 +339,17 @@ final class DataFrameNaFunctions private[sql](df: DataFrame) { return df } - // replacementMap is either Map[String, String] or Map[Double, Double] + // replacementMap is either Map[String, String] or Map[Double, Double] or Map[Boolean,Boolean] val replacementMap: Map[_, _] = replacement.head._2 match { case v: String => replacement + case v: Boolean => replacement case _ => replacement.map { case (k, v) => (convertToDouble(k), convertToDouble(v)) } } - // targetColumnType is either DoubleType or StringType + // targetColumnType is either DoubleType or StringType or BooleanType val targetColumnType = replacement.head._1 match { case _: jl.Double | _: jl.Float | _: jl.Integer | _: jl.Long => DoubleType + case _: jl.Boolean => BooleanType case _: String => StringType } @@ -367,7 +375,7 @@ final class DataFrameNaFunctions private[sql](df: DataFrame) { // Check data type replaceValue match { - case _: jl.Double | _: jl.Float | _: jl.Integer | _: jl.Long | _: String => + case _: jl.Double | _: jl.Float | _: jl.Integer | _: jl.Long | _: jl.Boolean | _: String => // This is good case _ => throw new IllegalArgumentException( s"Unsupported value type ${replaceValue.getClass.getName} ($replaceValue).") @@ -382,6 +390,7 @@ final class DataFrameNaFunctions private[sql](df: DataFrame) { case v: jl.Double => fillCol[Double](f, v) case v: jl.Long => fillCol[Double](f, v.toDouble) case v: jl.Integer => fillCol[Double](f, v.toDouble) + case v: jl.Boolean => fillCol[Boolean](f, v.booleanValue()) case v: String => fillCol[String](f, v) } }.getOrElse(df.col(f.name)) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameNaFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameNaFunctionsSuite.scala index 329ffb66083b1..e34875471f093 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameNaFunctionsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameNaFunctionsSuite.scala @@ -141,24 +141,26 @@ class DataFrameNaFunctionsSuite extends QueryTest with SharedSQLContext { } test("fill with map") { - val df = Seq[(String, String, java.lang.Long, java.lang.Double)]( - (null, null, null, null)).toDF("a", "b", "c", "d") + val df = Seq[(String, String, java.lang.Long, java.lang.Double, java.lang.Boolean)]( + (null, null, null, null, null)).toDF("a", "b", "c", "d", "e") checkAnswer( df.na.fill(Map( "a" -> "test", "c" -> 1, - "d" -> 2.2 + "d" -> 2.2, + "e" -> false )), - Row("test", null, 1, 2.2)) + Row("test", null, 1, 2.2, false)) // Test Java version checkAnswer( df.na.fill(Map( "a" -> "test", "c" -> 1, - "d" -> 2.2 + "d" -> 2.2, + "e" -> false ).asJava), - Row("test", null, 1, 2.2)) + Row("test", null, 1, 2.2, false)) } test("replace") { From 67582132bffbaaeaadc5cf8218f6239d03c39da0 Mon Sep 17 00:00:00 2001 From: zsxwing Date: Mon, 19 Oct 2015 15:35:14 -0700 Subject: [PATCH 103/139] 
[SPARK-11063] [STREAMING] Change preferredLocations of Receiver's RDD to hosts rather than hostports The format of RDD's preferredLocations must be hostname but the format of Streaming Receiver's scheduling executors is hostport. So it doesn't work. This PR converts `schedulerExecutors` to `hosts` before creating Receiver's RDD. Author: zsxwing Closes #9075 from zsxwing/SPARK-11063. --- .../scheduler/ReceiverSchedulingPolicy.scala | 3 ++- .../streaming/scheduler/ReceiverTracker.scala | 4 +++- .../scheduler/ReceiverTrackerSuite.scala | 24 +++++++++++++++++++ 3 files changed, 29 insertions(+), 2 deletions(-) diff --git a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceiverSchedulingPolicy.scala b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceiverSchedulingPolicy.scala index 10b5a7f57a802..d2b0be7f4a9c5 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceiverSchedulingPolicy.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceiverSchedulingPolicy.scala @@ -21,6 +21,7 @@ import scala.collection.Map import scala.collection.mutable import org.apache.spark.streaming.receiver.Receiver +import org.apache.spark.util.Utils /** * A class that tries to schedule receivers with evenly distributed. There are two phases for @@ -79,7 +80,7 @@ private[streaming] class ReceiverSchedulingPolicy { return receivers.map(_.streamId -> Seq.empty).toMap } - val hostToExecutors = executors.groupBy(_.split(":")(0)) + val hostToExecutors = executors.groupBy(executor => Utils.parseHostPort(executor)._1) val scheduledExecutors = Array.fill(receivers.length)(new mutable.ArrayBuffer[String]) val numReceiversOnExecutor = mutable.HashMap[String, Int]() // Set the initial value to 0 diff --git a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceiverTracker.scala b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceiverTracker.scala index d053e9e84910f..2ce80d618b0a3 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceiverTracker.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceiverTracker.scala @@ -551,7 +551,9 @@ class ReceiverTracker(ssc: StreamingContext, skipReceiverLaunch: Boolean = false if (scheduledExecutors.isEmpty) { ssc.sc.makeRDD(Seq(receiver), 1) } else { - ssc.sc.makeRDD(Seq(receiver -> scheduledExecutors)) + val preferredLocations = + scheduledExecutors.map(hostPort => Utils.parseHostPort(hostPort)._1).distinct + ssc.sc.makeRDD(Seq(receiver -> preferredLocations)) } receiverRDD.setName(s"Receiver $receiverId") ssc.sparkContext.setJobDescription(s"Streaming job running receiver $receiverId") diff --git a/streaming/src/test/scala/org/apache/spark/streaming/scheduler/ReceiverTrackerSuite.scala b/streaming/src/test/scala/org/apache/spark/streaming/scheduler/ReceiverTrackerSuite.scala index 45138b748ecab..fda86aef457d4 100644 --- a/streaming/src/test/scala/org/apache/spark/streaming/scheduler/ReceiverTrackerSuite.scala +++ b/streaming/src/test/scala/org/apache/spark/streaming/scheduler/ReceiverTrackerSuite.scala @@ -22,6 +22,8 @@ import scala.collection.mutable.ArrayBuffer import org.scalatest.concurrent.Eventually._ import org.scalatest.time.SpanSugar._ +import org.apache.spark.scheduler.{SparkListener, SparkListenerTaskStart, TaskLocality} +import org.apache.spark.scheduler.TaskLocality.TaskLocality import org.apache.spark.storage.{StorageLevel, StreamBlockId} import org.apache.spark.streaming._ import 
org.apache.spark.streaming.dstream.ReceiverInputDStream @@ -80,6 +82,28 @@ class ReceiverTrackerSuite extends TestSuiteBase { } } } + + test("SPARK-11063: TaskSetManager should use Receiver RDD's preferredLocations") { + // Use ManualClock to prevent from starting batches so that we can make sure the only task is + // for starting the Receiver + val _conf = conf.clone.set("spark.streaming.clock", "org.apache.spark.util.ManualClock") + withStreamingContext(new StreamingContext(_conf, Milliseconds(100))) { ssc => + @volatile var receiverTaskLocality: TaskLocality = null + ssc.sparkContext.addSparkListener(new SparkListener { + override def onTaskStart(taskStart: SparkListenerTaskStart): Unit = { + receiverTaskLocality = taskStart.taskInfo.taskLocality + } + }) + val input = ssc.receiverStream(new TestReceiver) + val output = new TestOutputStream(input) + output.register() + ssc.start() + eventually(timeout(10 seconds), interval(10 millis)) { + // If preferredLocations is set correctly, receiverTaskLocality should be NODE_LOCAL + assert(receiverTaskLocality === TaskLocality.NODE_LOCAL) + } + } + } } /** An input DStream with for testing rate controlling */ From 7ab0ce6501c37f0fc3a49e3332573ae4e4def3e8 Mon Sep 17 00:00:00 2001 From: Marcelo Vanzin Date: Mon, 19 Oct 2015 16:14:50 -0700 Subject: [PATCH 104/139] [SPARK-11131][CORE] Fix race in worker registration protocol. Because the registration RPC was not really an RPC, but a bunch of disconnected messages, it was possible for other messages to be sent before the reply to the registration arrived, and that would confuse the Worker. Especially in local-cluster mode, the worker was succeptible to receiving an executor request before it received a message from the master saying registration succeeded. On top of the above, the change also fixes a ClassCastException when the registration fails, which also affects the executor registration protocol. Because the `ask` is issued with a specific return type, if the error message (of a different type) was returned instead, the code would just die with an exception. This is fixed by having a common base trait for these reply messages. Author: Marcelo Vanzin Closes #9138 from vanzin/SPARK-11131. 
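The shape of the fix is easiest to see outside the Spark classes. Below is a minimal, self-contained Scala sketch of the same idea (all names here are invented for illustration; the real types in the diff that follows are RegisterWorkerResponse, RegisteredWorker, RegisterWorkerFailed and MasterInStandby): the worker issues a single typed ask, and every possible outcome, including the standby and failure cases, comes back through that one reply, so no other message can overtake the registration result and a failure reply no longer blows up a too-narrowly-typed ask.

    import scala.concurrent.{ExecutionContext, Future}
    import scala.util.{Failure, Success}

    // Stand-ins for the sealed reply hierarchy introduced by this patch.
    sealed trait RegisterResponse
    final case class Registered(masterUrl: String) extends RegisterResponse
    final case class RegistrationFailed(reason: String) extends RegisterResponse
    case object MasterStandby extends RegisterResponse

    object HandshakeSketch {
      // Stand-in for masterEndpoint.ask[RegisterWorkerResponse](RegisterWorker(...)):
      // the master picks exactly one outcome and it travels back on this future.
      def askMaster(duplicateId: Boolean, standby: Boolean): Future[RegisterResponse] =
        Future.successful {
          if (standby) MasterStandby
          else if (duplicateId) RegistrationFailed("Duplicate worker ID")
          else Registered("spark://master:7077")
        }

      def register()(implicit ec: ExecutionContext): Unit =
        askMaster(duplicateId = false, standby = false).onComplete {
          case Success(Registered(url))       => println(s"registered with $url; start heartbeats")
          case Success(RegistrationFailed(m)) => println(s"registration failed: $m") // exit in real code
          case Success(MasterStandby)         => () // ignore; this master is not ready yet
          case Failure(e)                     => println(s"cannot reach master: $e")
        }

      def main(args: Array[String]): Unit = {
        implicit val ec: ExecutionContext = ExecutionContext.global
        register()
        Thread.sleep(100) // toy example: give the callback time to run before the JVM exits
      }
    }

The sealed base trait is the piece that removes the ClassCastException: the ask is typed against the common parent, so a failure reply is a legitimate value of the expected type rather than a surprise.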
--- .../apache/spark/deploy/DeployMessage.scala | 7 +- .../apache/spark/deploy/master/Master.scala | 50 ++++++------- .../apache/spark/deploy/worker/Worker.scala | 73 ++++++++++++------- .../CoarseGrainedExecutorBackend.scala | 4 +- .../cluster/CoarseGrainedClusterMessage.scala | 4 + .../apache/spark/HeartbeatReceiverSuite.scala | 4 +- 6 files changed, 86 insertions(+), 56 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/deploy/DeployMessage.scala b/core/src/main/scala/org/apache/spark/deploy/DeployMessage.scala index d8084a57658ad..3feb7cea593e0 100644 --- a/core/src/main/scala/org/apache/spark/deploy/DeployMessage.scala +++ b/core/src/main/scala/org/apache/spark/deploy/DeployMessage.scala @@ -69,9 +69,14 @@ private[deploy] object DeployMessages { // Master to Worker + sealed trait RegisterWorkerResponse + case class RegisteredWorker(master: RpcEndpointRef, masterWebUiUrl: String) extends DeployMessage + with RegisterWorkerResponse + + case class RegisterWorkerFailed(message: String) extends DeployMessage with RegisterWorkerResponse - case class RegisterWorkerFailed(message: String) extends DeployMessage + case object MasterInStandby extends DeployMessage with RegisterWorkerResponse case class ReconnectWorker(masterUrl: String) extends DeployMessage diff --git a/core/src/main/scala/org/apache/spark/deploy/master/Master.scala b/core/src/main/scala/org/apache/spark/deploy/master/Master.scala index d518e92133aad..6715d6c70f497 100644 --- a/core/src/main/scala/org/apache/spark/deploy/master/Master.scala +++ b/core/src/main/scala/org/apache/spark/deploy/master/Master.scala @@ -233,31 +233,6 @@ private[deploy] class Master( System.exit(0) } - case RegisterWorker( - id, workerHost, workerPort, workerRef, cores, memory, workerUiPort, publicAddress) => { - logInfo("Registering worker %s:%d with %d cores, %s RAM".format( - workerHost, workerPort, cores, Utils.megabytesToString(memory))) - if (state == RecoveryState.STANDBY) { - // ignore, don't send response - } else if (idToWorker.contains(id)) { - workerRef.send(RegisterWorkerFailed("Duplicate worker ID")) - } else { - val worker = new WorkerInfo(id, workerHost, workerPort, cores, memory, - workerRef, workerUiPort, publicAddress) - if (registerWorker(worker)) { - persistenceEngine.addWorker(worker) - workerRef.send(RegisteredWorker(self, masterWebUiUrl)) - schedule() - } else { - val workerAddress = worker.endpoint.address - logWarning("Worker registration failed. 
Attempted to re-register worker at same " + - "address: " + workerAddress) - workerRef.send(RegisterWorkerFailed("Attempted to re-register worker at same address: " - + workerAddress)) - } - } - } - case RegisterApplication(description, driver) => { // TODO Prevent repeated registrations from some driver if (state == RecoveryState.STANDBY) { @@ -387,6 +362,31 @@ private[deploy] class Master( } override def receiveAndReply(context: RpcCallContext): PartialFunction[Any, Unit] = { + case RegisterWorker( + id, workerHost, workerPort, workerRef, cores, memory, workerUiPort, publicAddress) => { + logInfo("Registering worker %s:%d with %d cores, %s RAM".format( + workerHost, workerPort, cores, Utils.megabytesToString(memory))) + if (state == RecoveryState.STANDBY) { + context.reply(MasterInStandby) + } else if (idToWorker.contains(id)) { + context.reply(RegisterWorkerFailed("Duplicate worker ID")) + } else { + val worker = new WorkerInfo(id, workerHost, workerPort, cores, memory, + workerRef, workerUiPort, publicAddress) + if (registerWorker(worker)) { + persistenceEngine.addWorker(worker) + context.reply(RegisteredWorker(self, masterWebUiUrl)) + schedule() + } else { + val workerAddress = worker.endpoint.address + logWarning("Worker registration failed. Attempted to re-register worker at same " + + "address: " + workerAddress) + context.reply(RegisterWorkerFailed("Attempted to re-register worker at same address: " + + workerAddress)) + } + } + } + case RequestSubmitDriver(description) => { if (state != RecoveryState.ALIVE) { val msg = s"${Utils.BACKUP_STANDALONE_MASTER_PREFIX}: $state. " + diff --git a/core/src/main/scala/org/apache/spark/deploy/worker/Worker.scala b/core/src/main/scala/org/apache/spark/deploy/worker/Worker.scala index 93a1b3f310422..a45867e7680ec 100755 --- a/core/src/main/scala/org/apache/spark/deploy/worker/Worker.scala +++ b/core/src/main/scala/org/apache/spark/deploy/worker/Worker.scala @@ -26,7 +26,7 @@ import java.util.concurrent.{Future => JFuture, ScheduledFuture => JScheduledFut import scala.collection.mutable.{HashMap, HashSet, LinkedHashMap} import scala.concurrent.ExecutionContext -import scala.util.Random +import scala.util.{Failure, Random, Success} import scala.util.control.NonFatal import org.apache.spark.{Logging, SecurityManager, SparkConf} @@ -213,8 +213,7 @@ private[deploy] class Worker( logInfo("Connecting to master " + masterAddress + "...") val masterEndpoint = rpcEnv.setupEndpointRef(Master.SYSTEM_NAME, masterAddress, Master.ENDPOINT_NAME) - masterEndpoint.send(RegisterWorker( - workerId, host, port, self, cores, memory, webUi.boundPort, publicAddress)) + registerWithMaster(masterEndpoint) } catch { case ie: InterruptedException => // Cancelled case NonFatal(e) => logWarning(s"Failed to connect to master $masterAddress", e) @@ -271,8 +270,7 @@ private[deploy] class Worker( logInfo("Connecting to master " + masterAddress + "...") val masterEndpoint = rpcEnv.setupEndpointRef(Master.SYSTEM_NAME, masterAddress, Master.ENDPOINT_NAME) - masterEndpoint.send(RegisterWorker( - workerId, host, port, self, cores, memory, webUi.boundPort, publicAddress)) + registerWithMaster(masterEndpoint) } catch { case ie: InterruptedException => // Cancelled case NonFatal(e) => logWarning(s"Failed to connect to master $masterAddress", e) @@ -341,25 +339,54 @@ private[deploy] class Worker( } } - override def receive: PartialFunction[Any, Unit] = { - case RegisteredWorker(masterRef, masterWebUiUrl) => - logInfo("Successfully registered with master " + masterRef.address.toSparkURL) 
- registered = true - changeMaster(masterRef, masterWebUiUrl) - forwordMessageScheduler.scheduleAtFixedRate(new Runnable { - override def run(): Unit = Utils.tryLogNonFatalError { - self.send(SendHeartbeat) - } - }, 0, HEARTBEAT_MILLIS, TimeUnit.MILLISECONDS) - if (CLEANUP_ENABLED) { - logInfo(s"Worker cleanup enabled; old application directories will be deleted in: $workDir") + private def registerWithMaster(masterEndpoint: RpcEndpointRef): Unit = { + masterEndpoint.ask[RegisterWorkerResponse](RegisterWorker( + workerId, host, port, self, cores, memory, webUi.boundPort, publicAddress)) + .onComplete { + // This is a very fast action so we can use "ThreadUtils.sameThread" + case Success(msg) => + Utils.tryLogNonFatalError { + handleRegisterResponse(msg) + } + case Failure(e) => + logError(s"Cannot register with master: ${masterEndpoint.address}", e) + System.exit(1) + }(ThreadUtils.sameThread) + } + + private def handleRegisterResponse(msg: RegisterWorkerResponse): Unit = synchronized { + msg match { + case RegisteredWorker(masterRef, masterWebUiUrl) => + logInfo("Successfully registered with master " + masterRef.address.toSparkURL) + registered = true + changeMaster(masterRef, masterWebUiUrl) forwordMessageScheduler.scheduleAtFixedRate(new Runnable { override def run(): Unit = Utils.tryLogNonFatalError { - self.send(WorkDirCleanup) + self.send(SendHeartbeat) } - }, CLEANUP_INTERVAL_MILLIS, CLEANUP_INTERVAL_MILLIS, TimeUnit.MILLISECONDS) - } + }, 0, HEARTBEAT_MILLIS, TimeUnit.MILLISECONDS) + if (CLEANUP_ENABLED) { + logInfo( + s"Worker cleanup enabled; old application directories will be deleted in: $workDir") + forwordMessageScheduler.scheduleAtFixedRate(new Runnable { + override def run(): Unit = Utils.tryLogNonFatalError { + self.send(WorkDirCleanup) + } + }, CLEANUP_INTERVAL_MILLIS, CLEANUP_INTERVAL_MILLIS, TimeUnit.MILLISECONDS) + } + case RegisterWorkerFailed(message) => + if (!registered) { + logError("Worker registration failed: " + message) + System.exit(1) + } + + case MasterInStandby => + // Ignore. Master not yet ready. 
+ } + } + + override def receive: PartialFunction[Any, Unit] = synchronized { case SendHeartbeat => if (connected) { sendToMaster(Heartbeat(workerId, self)) } @@ -399,12 +426,6 @@ private[deploy] class Worker( map(e => new ExecutorDescription(e.appId, e.execId, e.cores, e.state)) masterRef.send(WorkerSchedulerStateResponse(workerId, execs.toList, drivers.keys.toSeq)) - case RegisterWorkerFailed(message) => - if (!registered) { - logError("Worker registration failed: " + message) - System.exit(1) - } - case ReconnectWorker(masterUrl) => logInfo(s"Master with url $masterUrl requested this worker to reconnect.") registerWithMaster() diff --git a/core/src/main/scala/org/apache/spark/executor/CoarseGrainedExecutorBackend.scala b/core/src/main/scala/org/apache/spark/executor/CoarseGrainedExecutorBackend.scala index 49059de50b42b..a9c6a05ecd434 100644 --- a/core/src/main/scala/org/apache/spark/executor/CoarseGrainedExecutorBackend.scala +++ b/core/src/main/scala/org/apache/spark/executor/CoarseGrainedExecutorBackend.scala @@ -59,12 +59,12 @@ private[spark] class CoarseGrainedExecutorBackend( rpcEnv.asyncSetupEndpointRefByURI(driverUrl).flatMap { ref => // This is a very fast action so we can use "ThreadUtils.sameThread" driver = Some(ref) - ref.ask[RegisteredExecutor.type]( + ref.ask[RegisterExecutorResponse]( RegisterExecutor(executorId, self, hostPort, cores, extractLogUrls)) }(ThreadUtils.sameThread).onComplete { // This is a very fast action so we can use "ThreadUtils.sameThread" case Success(msg) => Utils.tryLogNonFatalError { - Option(self).foreach(_.send(msg)) // msg must be RegisteredExecutor + Option(self).foreach(_.send(msg)) // msg must be RegisterExecutorResponse } case Failure(e) => { logError(s"Cannot register with driver: $driverUrl", e) diff --git a/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedClusterMessage.scala b/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedClusterMessage.scala index e0d25dc50c988..4652df32efa74 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedClusterMessage.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedClusterMessage.scala @@ -36,9 +36,13 @@ private[spark] object CoarseGrainedClusterMessages { case class KillTask(taskId: Long, executor: String, interruptThread: Boolean) extends CoarseGrainedClusterMessage + sealed trait RegisterExecutorResponse + case object RegisteredExecutor extends CoarseGrainedClusterMessage + with RegisterExecutorResponse case class RegisterExecutorFailed(message: String) extends CoarseGrainedClusterMessage + with RegisterExecutorResponse // Executors to driver case class RegisterExecutor( diff --git a/core/src/test/scala/org/apache/spark/HeartbeatReceiverSuite.scala b/core/src/test/scala/org/apache/spark/HeartbeatReceiverSuite.scala index 18f2229fea39b..3cd80c0f7d171 100644 --- a/core/src/test/scala/org/apache/spark/HeartbeatReceiverSuite.scala +++ b/core/src/test/scala/org/apache/spark/HeartbeatReceiverSuite.scala @@ -173,9 +173,9 @@ class HeartbeatReceiverSuite val dummyExecutorEndpoint2 = new FakeExecutorEndpoint(rpcEnv) val dummyExecutorEndpointRef1 = rpcEnv.setupEndpoint("fake-executor-1", dummyExecutorEndpoint1) val dummyExecutorEndpointRef2 = rpcEnv.setupEndpoint("fake-executor-2", dummyExecutorEndpoint2) - fakeSchedulerBackend.driverEndpoint.askWithRetry[RegisteredExecutor.type]( + fakeSchedulerBackend.driverEndpoint.askWithRetry[RegisterExecutorResponse]( RegisterExecutor(executorId1, dummyExecutorEndpointRef1, 
"dummy:4040", 0, Map.empty)) - fakeSchedulerBackend.driverEndpoint.askWithRetry[RegisteredExecutor.type]( + fakeSchedulerBackend.driverEndpoint.askWithRetry[RegisterExecutorResponse]( RegisterExecutor(executorId2, dummyExecutorEndpointRef2, "dummy:4040", 0, Map.empty)) heartbeatReceiverRef.askWithRetry[Boolean](TaskSchedulerIsSet) addExecutorAndVerify(executorId1) From a1413b3662250dd5e980e8b1f7c3dc4585ab4766 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Mon, 19 Oct 2015 16:16:31 -0700 Subject: [PATCH 105/139] [SPARK-11051][CORE] Do not allow local checkpointing after the RDD is materialized and checkpointed JIRA: https://issues.apache.org/jira/browse/SPARK-11051 When a `RDD` is materialized and checkpointed, its partitions and dependencies are cleared. If we allow local checkpointing on it and assign `LocalRDDCheckpointData` to its `checkpointData`. Next time when the RDD is materialized again, the error will be thrown. Author: Liang-Chi Hsieh Closes #9072 from viirya/no-localcheckpoint-after-checkpoint. --- .../main/scala/org/apache/spark/rdd/RDD.scala | 35 +++++++++++++++---- .../org/apache/spark/CheckpointSuite.scala | 4 +++ 2 files changed, 32 insertions(+), 7 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/rdd/RDD.scala b/core/src/main/scala/org/apache/spark/rdd/RDD.scala index a56e542242d5f..a97bb174438a5 100644 --- a/core/src/main/scala/org/apache/spark/rdd/RDD.scala +++ b/core/src/main/scala/org/apache/spark/rdd/RDD.scala @@ -294,7 +294,11 @@ abstract class RDD[T: ClassTag]( */ private[spark] def computeOrReadCheckpoint(split: Partition, context: TaskContext): Iterator[T] = { - if (isCheckpointed) firstParent[T].iterator(split, context) else compute(split, context) + if (isCheckpointedAndMaterialized) { + firstParent[T].iterator(split, context) + } else { + compute(split, context) + } } /** @@ -1520,20 +1524,37 @@ abstract class RDD[T: ClassTag]( persist(LocalRDDCheckpointData.transformStorageLevel(storageLevel), allowOverride = true) } - checkpointData match { - case Some(reliable: ReliableRDDCheckpointData[_]) => logWarning( - "RDD was already marked for reliable checkpointing: overriding with local checkpoint.") - case _ => + // If this RDD is already checkpointed and materialized, its lineage is already truncated. + // We must not override our `checkpointData` in this case because it is needed to recover + // the checkpointed data. If it is overridden, next time materializing on this RDD will + // cause error. + if (isCheckpointedAndMaterialized) { + logWarning("Not marking RDD for local checkpoint because it was already " + + "checkpointed and materialized") + } else { + // Lineage is not truncated yet, so just override any existing checkpoint data with ours + checkpointData match { + case Some(_: ReliableRDDCheckpointData[_]) => logWarning( + "RDD was already marked for reliable checkpointing: overriding with local checkpoint.") + case _ => + } + checkpointData = Some(new LocalRDDCheckpointData(this)) } - checkpointData = Some(new LocalRDDCheckpointData(this)) this } /** - * Return whether this RDD is marked for checkpointing, either reliably or locally. + * Return whether this RDD is checkpointed and materialized, either reliably or locally. */ def isCheckpointed: Boolean = checkpointData.exists(_.isCheckpointed) + /** + * Return whether this RDD is checkpointed and materialized, either reliably or locally. + * This is introduced as an alias for `isCheckpointed` to clarify the semantics of the + * return value. Exposed for testing. 
+ */ + private[spark] def isCheckpointedAndMaterialized: Boolean = isCheckpointed + /** * Return whether this RDD is marked for local checkpointing. * Exposed for testing. diff --git a/core/src/test/scala/org/apache/spark/CheckpointSuite.scala b/core/src/test/scala/org/apache/spark/CheckpointSuite.scala index 4d70bfed909b6..119e5fc28e412 100644 --- a/core/src/test/scala/org/apache/spark/CheckpointSuite.scala +++ b/core/src/test/scala/org/apache/spark/CheckpointSuite.scala @@ -241,9 +241,13 @@ class CheckpointSuite extends SparkFunSuite with LocalSparkContext with Logging val rdd = new BlockRDD[Int](sc, Array[BlockId]()) assert(rdd.partitions.size === 0) assert(rdd.isCheckpointed === false) + assert(rdd.isCheckpointedAndMaterialized === false) checkpoint(rdd, reliableCheckpoint) + assert(rdd.isCheckpointed === false) + assert(rdd.isCheckpointedAndMaterialized === false) assert(rdd.count() === 0) assert(rdd.isCheckpointed === true) + assert(rdd.isCheckpointedAndMaterialized === true) assert(rdd.partitions.size === 0) } From 232d7f8d42950431f1d9be2a6bb3591fb6ea20d6 Mon Sep 17 00:00:00 2001 From: Davies Liu Date: Mon, 19 Oct 2015 16:18:20 -0700 Subject: [PATCH 106/139] [SPARK-11114][PYSPARK] add getOrCreate for SparkContext/SQLContext in Python Also added SQLContext.newSession() Author: Davies Liu Closes #9122 from davies/py_create. --- python/pyspark/context.py | 16 ++++++++++++++-- python/pyspark/sql/context.py | 27 +++++++++++++++++++++++++++ python/pyspark/sql/tests.py | 14 ++++++++++++++ python/pyspark/tests.py | 4 ++++ 4 files changed, 59 insertions(+), 2 deletions(-) diff --git a/python/pyspark/context.py b/python/pyspark/context.py index 4969d85f52b23..afd74d937a413 100644 --- a/python/pyspark/context.py +++ b/python/pyspark/context.py @@ -21,7 +21,7 @@ import shutil import signal import sys -from threading import Lock +from threading import RLock from tempfile import NamedTemporaryFile from pyspark import accumulators @@ -65,7 +65,7 @@ class SparkContext(object): _jvm = None _next_accum_id = 0 _active_spark_context = None - _lock = Lock() + _lock = RLock() _python_includes = None # zip and egg files that need to be added to PYTHONPATH PACKAGE_EXTENSIONS = ('.zip', '.egg', '.jar') @@ -280,6 +280,18 @@ def __exit__(self, type, value, trace): """ self.stop() + @classmethod + def getOrCreate(cls, conf=None): + """ + Get or instantiate a SparkContext and register it as a singleton object. + + :param conf: SparkConf (optional) + """ + with SparkContext._lock: + if SparkContext._active_spark_context is None: + SparkContext(conf=conf or SparkConf()) + return SparkContext._active_spark_context + def setLogLevel(self, logLevel): """ Control our logLevel. This overrides any user-defined log settings. diff --git a/python/pyspark/sql/context.py b/python/pyspark/sql/context.py index 89c8c6e0d94f1..79453658a167a 100644 --- a/python/pyspark/sql/context.py +++ b/python/pyspark/sql/context.py @@ -75,6 +75,8 @@ class SQLContext(object): SQLContext in the JVM, instead we make all calls to this object. """ + _instantiatedContext = None + @ignore_unicode_prefix def __init__(self, sparkContext, sqlContext=None): """Creates a new SQLContext. 
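
(For reference: the two PySpark entry points added in this patch are thin wrappers over the
JVM-side SQLContext. A minimal Scala sketch of the behaviour they delegate to is shown below;
it assumes an already-constructed SparkContext named `sc` and is an illustration only, not part
of the patch.)

    import org.apache.spark.SparkContext
    import org.apache.spark.sql.SQLContext

    def demo(sc: SparkContext): Unit = {
      // getOrCreate reuses the singleton SQLContext if one has already been created.
      val sqlContext = SQLContext.getOrCreate(sc)
      assert(SQLContext.getOrCreate(sc) eq sqlContext)

      // newSession shares the SparkContext and cached tables, but has its own SQLConf,
      // temporary tables and registered UDFs.
      val session = sqlContext.newSession()
      sqlContext.setConf("spark.sql.shuffle.partitions", "10")
      session.setConf("spark.sql.shuffle.partitions", "20")
      assert(sqlContext.getConf("spark.sql.shuffle.partitions") == "10")
      assert(session.getConf("spark.sql.shuffle.partitions") == "20")
    }
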
@@ -99,6 +101,8 @@ def __init__(self, sparkContext, sqlContext=None): self._scala_SQLContext = sqlContext _monkey_patch_RDD(self) install_exception_handler() + if SQLContext._instantiatedContext is None: + SQLContext._instantiatedContext = self @property def _ssql_ctx(self): @@ -111,6 +115,29 @@ def _ssql_ctx(self): self._scala_SQLContext = self._jvm.SQLContext(self._jsc.sc()) return self._scala_SQLContext + @classmethod + @since(1.6) + def getOrCreate(cls, sc): + """ + Get the existing SQLContext or create a new one with given SparkContext. + + :param sc: SparkContext + """ + if cls._instantiatedContext is None: + jsqlContext = sc._jvm.SQLContext.getOrCreate(sc._jsc.sc()) + cls(sc, jsqlContext) + return cls._instantiatedContext + + @since(1.6) + def newSession(self): + """ + Returns a new SQLContext as new session, that has separate SQLConf, + registered temporary tables and UDFs, but shared SparkContext and + table cache. + """ + jsqlContext = self._ssql_ctx.newSession() + return self.__class__(self._sc, jsqlContext) + @since(1.3) def setConf(self, key, value): """Sets the given Spark SQL configuration property. diff --git a/python/pyspark/sql/tests.py b/python/pyspark/sql/tests.py index 645133b2b2d84..f465e1fa20941 100644 --- a/python/pyspark/sql/tests.py +++ b/python/pyspark/sql/tests.py @@ -174,6 +174,20 @@ def test_datetype_equal_zero(self): self.assertEqual(dt.fromInternal(0), datetime.date(1970, 1, 1)) +class SQLContextTests(ReusedPySparkTestCase): + def test_get_or_create(self): + sqlCtx = SQLContext.getOrCreate(self.sc) + self.assertTrue(SQLContext.getOrCreate(self.sc) is sqlCtx) + + def test_new_session(self): + sqlCtx = SQLContext.getOrCreate(self.sc) + sqlCtx.setConf("test_key", "a") + sqlCtx2 = sqlCtx.newSession() + sqlCtx2.setConf("test_key", "b") + self.assertEqual(sqlCtx.getConf("test_key", ""), "a") + self.assertEqual(sqlCtx2.getConf("test_key", ""), "b") + + class SQLTests(ReusedPySparkTestCase): @classmethod diff --git a/python/pyspark/tests.py b/python/pyspark/tests.py index 63cc87e0c4b2c..3c51809444401 100644 --- a/python/pyspark/tests.py +++ b/python/pyspark/tests.py @@ -1883,6 +1883,10 @@ def test_failed_sparkcontext_creation(self): # Regression test for SPARK-1550 self.assertRaises(Exception, lambda: SparkContext("an-invalid-master-name")) + def test_get_or_create(self): + with SparkContext.getOrCreate() as sc: + self.assertTrue(SparkContext.getOrCreate() is sc) + def test_stop(self): sc = SparkContext() self.assertNotEqual(SparkContext._active_spark_context, None) From fc26f32cf1bede8b9a1343dca0c0182107c9985e Mon Sep 17 00:00:00 2001 From: Chris Bannister Date: Mon, 19 Oct 2015 16:24:40 -0700 Subject: [PATCH 107/139] [SPARK-9708][MESOS] Spark should create local temporary directories in Mesos sandbox when launched with Mesos This is my own original work and I license this to the project under the project's open source license Author: Chris Bannister Author: Chris Bannister Closes #8358 from Zariel/mesos-local-dir. --- .../scala/org/apache/spark/util/Utils.scala | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/util/Utils.scala b/core/src/main/scala/org/apache/spark/util/Utils.scala index 22c05a2479422..55950405f0488 100644 --- a/core/src/main/scala/org/apache/spark/util/Utils.scala +++ b/core/src/main/scala/org/apache/spark/util/Utils.scala @@ -649,6 +649,7 @@ private[spark] object Utils extends Logging { * logic of locating the local directories according to deployment mode. 
   */
  def getConfiguredLocalDirs(conf: SparkConf): Array[String] = {
+    val shuffleServiceEnabled = conf.getBoolean("spark.shuffle.service.enabled", false)
    if (isRunningInYarnContainer(conf)) {
      // If we are in yarn mode, systems can have different disk layouts so we must set it
      // to what Yarn on this system said was available. Note this assumes that Yarn has
@@ -657,13 +658,23 @@ private[spark] object Utils extends Logging {
      getYarnLocalDirs(conf).split(",")
    } else if (conf.getenv("SPARK_EXECUTOR_DIRS") != null) {
      conf.getenv("SPARK_EXECUTOR_DIRS").split(File.pathSeparator)
+    } else if (conf.getenv("SPARK_LOCAL_DIRS") != null) {
+      conf.getenv("SPARK_LOCAL_DIRS").split(",")
+    } else if (conf.getenv("MESOS_DIRECTORY") != null && !shuffleServiceEnabled) {
+      // Mesos already creates a directory per Mesos task. Spark should use that directory
+      // instead so all temporary files are automatically cleaned up when the Mesos task ends.
+      // Note that we don't want this if the shuffle service is enabled because we want to
+      // continue to serve shuffle files after the executors that wrote them have already exited.
+      Array(conf.getenv("MESOS_DIRECTORY"))
    } else {
+      if (conf.getenv("MESOS_DIRECTORY") != null && shuffleServiceEnabled) {
+        logInfo("MESOS_DIRECTORY available but not using provided Mesos sandbox because " +
+          "spark.shuffle.service.enabled is enabled.")
+      }
      // In non-Yarn mode (or for the driver in yarn-client mode), we cannot trust the user
      // configuration to point to a secure directory. So create a subdirectory with restricted
      // permissions under each listed directory.
-      Option(conf.getenv("SPARK_LOCAL_DIRS"))
-        .getOrElse(conf.get("spark.local.dir", System.getProperty("java.io.tmpdir")))
-        .split(",")
+      conf.get("spark.local.dir", System.getProperty("java.io.tmpdir")).split(",")
    }
  }

From 16906ef23a7aa2854c8cdcaa3bb3808ab39e0eec Mon Sep 17 00:00:00 2001
From: Ryan Williams
Date: Mon, 19 Oct 2015 16:34:15 -0700
Subject: [PATCH 108/139] [SPARK-11120] Allow sane default number of executor
 failures when dynamically allocating in YARN

I also added some information to container-failure error messages about what
host they failed on, which would have helped me identify the problem that led
me to this JIRA and PR sooner.

Author: Ryan Williams

Closes #9147 from ryan-williams/dyn-exec-failures.
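
For clarity, the new default can be summarised as max(3, 2 * effectiveNumExecutors), where the
executor count is taken from spark.dynamicAllocation.maxExecutors when dynamic allocation is
enabled and from spark.executor.instances otherwise, and spark.yarn.max.executor.failures still
overrides everything. A rough, self-contained Scala sketch of that rule (the helper name and the
plain Map of settings are stand-ins for SparkConf, introduced only for illustration):

    object MaxExecutorFailuresSketch {
      def maxNumExecutorFailures(settings: Map[String, String]): Int = {
        val dynamicAllocation = settings.get("spark.dynamicAllocation.enabled").exists(_.toBoolean)
        val sizingKey =
          if (dynamicAllocation) "spark.dynamicAllocation.maxExecutors" else "spark.executor.instances"
        val effectiveNumExecutors = settings.get(sizingKey).map(_.toInt).getOrElse(0)
        val default = math.max(3, 2 * effectiveNumExecutors)
        settings.get("spark.yarn.max.executor.failures").map(_.toInt).getOrElse(default)
      }

      def main(args: Array[String]): Unit = {
        // No explicit sizing: fall back to the minimum of 3.
        println(maxNumExecutorFailures(Map.empty))                                // 3
        // Static allocation with 10 executors: 2 * 10 = 20.
        println(maxNumExecutorFailures(Map("spark.executor.instances" -> "10")))  // 20
        // Dynamic allocation is sized by its upper bound, not the (possibly 0) instance count.
        println(maxNumExecutorFailures(Map(
          "spark.dynamicAllocation.enabled" -> "true",
          "spark.dynamicAllocation.maxExecutors" -> "50")))                       // 100
      }
    }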
--- .../scala/org/apache/spark/SparkConf.scala | 4 +++- .../spark/deploy/yarn/ApplicationMaster.scala | 19 +++++++++++++++---- .../spark/deploy/yarn/YarnAllocator.scala | 19 +++++++++++-------- 3 files changed, 29 insertions(+), 13 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/SparkConf.scala b/core/src/main/scala/org/apache/spark/SparkConf.scala index 1a0ac3d01759c..58d3b846fd80d 100644 --- a/core/src/main/scala/org/apache/spark/SparkConf.scala +++ b/core/src/main/scala/org/apache/spark/SparkConf.scala @@ -595,7 +595,9 @@ private[spark] object SparkConf extends Logging { "spark.rpc.lookupTimeout" -> Seq( AlternateConfig("spark.akka.lookupTimeout", "1.4")), "spark.streaming.fileStream.minRememberDuration" -> Seq( - AlternateConfig("spark.streaming.minRememberDuration", "1.5")) + AlternateConfig("spark.streaming.minRememberDuration", "1.5")), + "spark.yarn.max.executor.failures" -> Seq( + AlternateConfig("spark.yarn.max.worker.failures", "1.5")) ) /** diff --git a/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala b/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala index d1d248bf79beb..4b4d9990ce9f9 100644 --- a/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala +++ b/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala @@ -62,10 +62,21 @@ private[spark] class ApplicationMaster( .asInstanceOf[YarnConfiguration] private val isClusterMode = args.userClass != null - // Default to numExecutors * 2, with minimum of 3 - private val maxNumExecutorFailures = sparkConf.getInt("spark.yarn.max.executor.failures", - sparkConf.getInt("spark.yarn.max.worker.failures", - math.max(sparkConf.getInt("spark.executor.instances", 0) * 2, 3))) + // Default to twice the number of executors (twice the maximum number of executors if dynamic + // allocation is enabled), with a minimum of 3. + + private val maxNumExecutorFailures = { + val defaultKey = + if (Utils.isDynamicAllocationEnabled(sparkConf)) { + "spark.dynamicAllocation.maxExecutors" + } else { + "spark.executor.instances" + } + val effectiveNumExecutors = sparkConf.getInt(defaultKey, 0) + val defaultMaxNumExecutorFailures = math.max(3, 2 * effectiveNumExecutors) + + sparkConf.getInt("spark.yarn.max.executor.failures", defaultMaxNumExecutorFailures) + } @volatile private var exitCode = 0 @volatile private var unregistered = false diff --git a/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnAllocator.scala b/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnAllocator.scala index 9e1ef1b3b4229..1deaa3743ddfa 100644 --- a/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnAllocator.scala +++ b/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnAllocator.scala @@ -430,17 +430,20 @@ private[yarn] class YarnAllocator( for (completedContainer <- completedContainers) { val containerId = completedContainer.getContainerId val alreadyReleased = releasedContainers.remove(containerId) + val hostOpt = allocatedContainerToHostMap.get(containerId) + val onHostStr = hostOpt.map(host => s" on host: $host").getOrElse("") val exitReason = if (!alreadyReleased) { // Decrement the number of executors running. The next iteration of // the ApplicationMaster's reporting thread will take care of allocating. 
numExecutorsRunning -= 1 - logInfo("Completed container %s (state: %s, exit status: %s)".format( + logInfo("Completed container %s%s (state: %s, exit status: %s)".format( containerId, + onHostStr, completedContainer.getState, completedContainer.getExitStatus)) // Hadoop 2.2.X added a ContainerExitStatus we should switch to use // there are some exit status' we shouldn't necessarily count against us, but for - // now I think its ok as none of the containers are expected to exit + // now I think its ok as none of the containers are expected to exit. val exitStatus = completedContainer.getExitStatus val (isNormalExit, containerExitReason) = exitStatus match { case ContainerExitStatus.SUCCESS => @@ -449,7 +452,7 @@ private[yarn] class YarnAllocator( // Preemption should count as a normal exit, since YARN preempts containers merely // to do resource sharing, and tasks that fail due to preempted executors could // just as easily finish on any other executor. See SPARK-8167. - (true, s"Container $containerId was preempted.") + (true, s"Container ${containerId}${onHostStr} was preempted.") // Should probably still count memory exceeded exit codes towards task failures case VMEM_EXCEEDED_EXIT_CODE => (false, memLimitExceededLogMessage( @@ -461,7 +464,7 @@ private[yarn] class YarnAllocator( PMEM_EXCEEDED_PATTERN)) case unknown => numExecutorsFailed += 1 - (false, "Container marked as failed: " + containerId + + (false, "Container marked as failed: " + containerId + onHostStr + ". Exit status: " + completedContainer.getExitStatus + ". Diagnostics: " + completedContainer.getDiagnostics) @@ -479,10 +482,10 @@ private[yarn] class YarnAllocator( s"Container $containerId exited from explicit termination request.") } - if (allocatedContainerToHostMap.contains(containerId)) { - val host = allocatedContainerToHostMap.get(containerId).get - val containerSet = allocatedHostToContainersMap.get(host).get - + for { + host <- hostOpt + containerSet <- allocatedHostToContainersMap.get(host) + } { containerSet.remove(containerId) if (containerSet.isEmpty) { allocatedHostToContainersMap.remove(host) From 8b877cc4ee46cad9d1f7cac451801c1410f6c1fe Mon Sep 17 00:00:00 2001 From: Cheng Lian Date: Mon, 19 Oct 2015 16:57:20 -0700 Subject: [PATCH 109/139] [SPARK-11088][SQL] Merges partition values using UnsafeProjection `DataSourceStrategy.mergeWithPartitionValues` is essentially a projection implemented in a quite inefficient way. This PR optimizes this method with `UnsafeProjection` to avoid unnecessary boxing costs. Author: Cheng Lian Closes #9104 from liancheng/spark-11088.faster-partition-values-merging. 
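
The core trick is to treat "data columns followed by partition columns" as the input schema,
wrap each data row together with the constant partition-values row in a reused JoinedRow, and
let a single UnsafeProjection reorder and copy the fields without boxing. A minimal Scala
sketch of that pattern, using catalyst internals of this vintage (column names and values are
made up for illustration):

    import org.apache.spark.sql.catalyst.InternalRow
    import org.apache.spark.sql.catalyst.expressions.{AttributeReference, JoinedRow, UnsafeProjection}
    import org.apache.spark.sql.types.{IntegerType, LongType}

    object MergePartitionValuesSketch {
      def main(args: Array[String]): Unit = {
        // Two columns read from the data files, one column encoded in the partition directory.
        val dataColumns = Seq(
          AttributeReference("id", LongType)(),
          AttributeReference("value", IntegerType)())
        val partitionColumns = Seq(AttributeReference("part", IntegerType)())

        // Requested output interleaves a partition column with a data column: (part, id).
        val requiredColumns = Seq(partitionColumns.head, dataColumns.head)

        // Build the projection once; reuse a single JoinedRow for every input row.
        val projection = UnsafeProjection.create(requiredColumns, dataColumns ++ partitionColumns)
        val joinedRow = new JoinedRow()
        val partitionValues = InternalRow(42)

        Seq(InternalRow(1L, 10), InternalRow(2L, 20)).foreach { dataRow =>
          val merged = projection(joinedRow(dataRow, partitionValues))
          println(s"part=${merged.getInt(0)}, id=${merged.getLong(1)}")
        }
      }
    }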
--- .../datasources/DataSourceStrategy.scala | 73 ++++++------------- 1 file changed, 24 insertions(+), 49 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala index 33181fa6c065f..ffb4645b89321 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala @@ -140,29 +140,30 @@ private[sql] object DataSourceStrategy extends Strategy with Logging { val sharedHadoopConf = SparkHadoopUtil.get.conf val confBroadcast = relation.sqlContext.sparkContext.broadcast(new SerializableConfiguration(sharedHadoopConf)) + val partitionColumnNames = partitionColumns.fieldNames.toSet // Now, we create a scan builder, which will be used by pruneFilterProject. This scan builder // will union all partitions and attach partition values if needed. val scanBuilder = { - (columns: Seq[Attribute], filters: Array[Filter]) => { + (requiredColumns: Seq[Attribute], filters: Array[Filter]) => { + val requiredDataColumns = + requiredColumns.filterNot(c => partitionColumnNames.contains(c.name)) + // Builds RDD[Row]s for each selected partition. val perPartitionRows = partitions.map { case Partition(partitionValues, dir) => - val partitionColNames = partitionColumns.fieldNames - // Don't scan any partition columns to save I/O. Here we are being optimistic and // assuming partition columns data stored in data files are always consistent with those // partition values encoded in partition directory paths. - val needed = columns.filterNot(a => partitionColNames.contains(a.name)) - val dataRows = - relation.buildScan(needed.map(_.name).toArray, filters, Array(dir), confBroadcast) + val dataRows = relation.buildScan( + requiredDataColumns.map(_.name).toArray, filters, Array(dir), confBroadcast) // Merges data values with partition values. mergeWithPartitionValues( - relation.schema, - columns.map(_.name).toArray, - partitionColNames, + requiredColumns, + requiredDataColumns, + partitionColumns, partitionValues, - toCatalystRDD(logicalRelation, needed, dataRows)) + toCatalystRDD(logicalRelation, requiredDataColumns, dataRows)) } val unionedRows = @@ -188,52 +189,27 @@ private[sql] object DataSourceStrategy extends Strategy with Logging { sparkPlan } - // TODO: refactor this thing. It is very complicated because it does projection internally. - // We should just put a project on top of this. private def mergeWithPartitionValues( - schema: StructType, - requiredColumns: Array[String], - partitionColumns: Array[String], + requiredColumns: Seq[Attribute], + dataColumns: Seq[Attribute], + partitionColumnSchema: StructType, partitionValues: InternalRow, dataRows: RDD[InternalRow]): RDD[InternalRow] = { - val nonPartitionColumns = requiredColumns.filterNot(partitionColumns.contains) - // If output columns contain any partition column(s), we need to merge scanned data // columns and requested partition columns to form the final result. - if (!requiredColumns.sameElements(nonPartitionColumns)) { - val mergers = requiredColumns.zipWithIndex.map { case (name, index) => - // To see whether the `index`-th column is a partition column... - val i = partitionColumns.indexOf(name) - if (i != -1) { - val dt = schema(partitionColumns(i)).dataType - // If yes, gets column value from partition values. 
- (mutableRow: MutableRow, dataRow: InternalRow, ordinal: Int) => { - mutableRow(ordinal) = partitionValues.get(i, dt) - } - } else { - // Otherwise, inherits the value from scanned data. - val i = nonPartitionColumns.indexOf(name) - val dt = schema(nonPartitionColumns(i)).dataType - (mutableRow: MutableRow, dataRow: InternalRow, ordinal: Int) => { - mutableRow(ordinal) = dataRow.get(i, dt) - } - } + if (requiredColumns != dataColumns) { + // Builds `AttributeReference`s for all partition columns so that we can use them to project + // required partition columns. Note that if a partition column appears in `requiredColumns`, + // we should use the `AttributeReference` in `requiredColumns`. + val requiredColumnMap = requiredColumns.map(a => a.name -> a).toMap + val partitionColumns = partitionColumnSchema.toAttributes.map { a => + requiredColumnMap.getOrElse(a.name, a) } - // Since we know for sure that this closure is serializable, we can avoid the overhead - // of cleaning a closure for each RDD by creating our own MapPartitionsRDD. Functionally - // this is equivalent to calling `dataRows.mapPartitions(mapPartitionsFunc)` (SPARK-7718). val mapPartitionsFunc = (_: TaskContext, _: Int, iterator: Iterator[InternalRow]) => { - val dataTypes = requiredColumns.map(schema(_).dataType) - val mutableRow = new SpecificMutableRow(dataTypes) - iterator.map { dataRow => - var i = 0 - while (i < mutableRow.numFields) { - mergers(i)(mutableRow, dataRow, i) - i += 1 - } - mutableRow.asInstanceOf[InternalRow] - } + val projection = UnsafeProjection.create(requiredColumns, dataColumns ++ partitionColumns) + val mutableJoinedRow = new JoinedRow() + iterator.map(dataRow => projection(mutableJoinedRow(dataRow, partitionValues))) } // This is an internal RDD whose call site the user should not be concerned with @@ -242,7 +218,6 @@ private[sql] object DataSourceStrategy extends Strategy with Logging { Utils.withDummyCallSite(dataRows.sparkContext) { new MapPartitionsRDD(dataRows, mapPartitionsFunc, preservesPartitioning = false) } - } else { dataRows } From 8f74aa639759f400120794355511327fa74905da Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jean-Baptiste=20Onofr=C3=A9?= Date: Tue, 20 Oct 2015 08:45:39 +0100 Subject: [PATCH 110/139] [SPARK-10876] Display total uptime for completed applications MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Author: Jean-Baptiste Onofré Closes #9059 from jbonofre/SPARK-10876. --- .../org/apache/spark/ui/jobs/AllJobsPage.scala | 18 +++++++++++------- .../spark/ui/jobs/JobProgressListener.scala | 7 ++++++- 2 files changed, 17 insertions(+), 8 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/ui/jobs/AllJobsPage.scala b/core/src/main/scala/org/apache/spark/ui/jobs/AllJobsPage.scala index 041cd55ea483b..d467dd9e1f29d 100644 --- a/core/src/main/scala/org/apache/spark/ui/jobs/AllJobsPage.scala +++ b/core/src/main/scala/org/apache/spark/ui/jobs/AllJobsPage.scala @@ -265,6 +265,7 @@ private[ui] class AllJobsPage(parent: JobsTab) extends WebUIPage("") { val listener = parent.jobProgresslistener listener.synchronized { val startTime = listener.startTime + val endTime = listener.endTime val activeJobs = listener.activeJobs.values.toSeq val completedJobs = listener.completedJobs.reverse.toSeq val failedJobs = listener.failedJobs.reverse.toSeq @@ -289,13 +290,16 @@ private[ui] class AllJobsPage(parent: JobsTab) extends WebUIPage("") { val summary: NodeSeq =